summaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib')
-rw-r--r--src/lib/CMakeLists.txt7
-rw-r--r--src/lib/config.h.in25
-rw-r--r--src/lib/crc/crc16.c61
-rw-r--r--src/lib/crc/crc32.c (renamed from src/lib/crc32.c)0
-rw-r--r--src/lib/crc/crc64.c363
-rw-r--r--src/lib/crc/crc8.c62
-rw-r--r--src/lib/crc/tests/CMakeLists.txt21
-rw-r--r--src/lib/crc/tests/crc16_test.c67
-rw-r--r--src/lib/crc/tests/crc32_test.c (renamed from src/lib/tests/crc32_test.c)0
-rw-r--r--src/lib/crc/tests/crc64_test.c126
-rw-r--r--src/lib/crc/tests/crc8_test.c67
-rw-r--r--src/lib/crypt.c7
-rw-r--r--src/lib/crypt/openssl.c44
-rw-r--r--src/lib/crypt/openssl.h2
-rw-r--r--src/lib/dev.c1329
-rw-r--r--src/lib/frct.c4245
-rw-r--r--src/lib/hash.c37
-rw-r--r--src/lib/irm.c2
-rw-r--r--src/lib/pb/ipcp.proto2
-rw-r--r--src/lib/pb/irm.proto6
-rw-r--r--src/lib/pb/model.proto5
-rw-r--r--src/lib/protobuf.c10
-rw-r--r--src/lib/qoscube.c12
-rw-r--r--src/lib/random.c7
-rw-r--r--src/lib/rib.c18
-rw-r--r--src/lib/ssm/flow_set.c24
-rw-r--r--src/lib/ssm/pool.c82
-rw-r--r--src/lib/ssm/rbuff.c60
-rw-r--r--src/lib/ssm/ssm.h.in19
-rw-r--r--src/lib/ssm/tests/pool_sharding_test.c69
-rw-r--r--src/lib/ssm/tests/pool_test.c12
-rw-r--r--src/lib/tests/CMakeLists.txt2
-rw-r--r--src/lib/tests/auth_test.c55
-rw-r--r--src/lib/tests/hash_test.c110
-rw-r--r--src/lib/tests/kex_test.c14
-rw-r--r--src/lib/tests/kex_test_ml_kem.c18
-rw-r--r--src/lib/tests/tpm_test.c2
-rw-r--r--src/lib/tests/tw_test.c663
-rw-r--r--src/lib/timerwheel.c414
-rw-r--r--src/lib/tw.c307
40 files changed, 6792 insertions, 1584 deletions
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 79263924..6cd3a8a4 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -17,7 +17,10 @@ protobuf_generate_c(IPCP_PROTO_SRCS IPCP_PROTO_HDRS
set(SOURCE_FILES_COMMON
bitmap.c
btree.c
- crc32.c
+ crc/crc8.c
+ crc/crc16.c
+ crc/crc32.c
+ crc/crc64.c
crypt.c
hash.c
lockfile.c
@@ -36,6 +39,7 @@ set(SOURCE_FILES_COMMON
ssm/pool.c
sockets.c
tpm.c
+ tw.c
utils.c
)
@@ -155,5 +159,6 @@ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/ssm/ssm.h.in"
if(BUILD_TESTS)
add_subdirectory(tests)
+ add_subdirectory(crc/tests)
add_subdirectory(ssm/tests)
endif()
diff --git a/src/lib/config.h.in b/src/lib/config.h.in
index 08e9baf6..7124a974 100644
--- a/src/lib/config.h.in
+++ b/src/lib/config.h.in
@@ -20,6 +20,14 @@
* Foundation, Inc., http://www.fsf.org/about/contact/.
*/
+#ifndef MILLION
+#define MILLION 1000000LL
+#endif
+
+#ifndef BILLION
+#define BILLION 1000000000LL
+#endif
+
#cmakedefine HAVE_SYS_RANDOM
#cmakedefine HAVE_EXPLICIT_BZERO
#cmakedefine HAVE_LIBGCRYPT
@@ -37,6 +45,8 @@
#cmakedefine QOS_DISABLE_CRC
#cmakedefine HAVE_OPENSSL_RNG
+#cmakedefine HAVE_PCLMUL
+#cmakedefine HAVE_PMULL
#define SHM_LOCKFILE_NAME "@SHM_LOCKFILE_NAME@"
#define FLOW_ALLOC_TIMEOUT @FLOW_ALLOC_TIMEOUT@
@@ -60,16 +70,18 @@
#cmakedefine PROC_FLOW_STATS
#endif
+#cmakedefine FRCT_DEBUG_STDOUT
+
#define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@
-#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@
-#define PROG_RES_FDS @PROG_RES_FDS@
-#define PROG_MAX_FQUEUES @PROG_MAX_FQUEUES@
+#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@
+#define PROC_RES_FDS @PROC_RES_FDS@
+#define PROC_MAX_FQUEUES @PROC_MAX_FQUEUES@
/* Default Delta-t parameters */
#cmakedefine FRCT_LINUX_RTT_ESTIMATOR
-#define DELT_A (@DELTA_T_ACK@) /* ns */
-#define DELT_R (@DELTA_T_RTX@) /* ns */
+#define DELT_A (@DELTA_T_ACK@) /* ms */
+#define DELT_R (@DELTA_T_RTX@) /* ms */
#define RQ_SIZE (@FRCT_REORDER_QUEUE_SIZE@)
#define START_WINDOW (@FRCT_START_WINDOW@)
@@ -80,9 +92,6 @@
#define TICTIME (@FRCT_TICK_TIME@ * 1000) /* ns */
/* Retransmission tuning */
-#cmakedefine RXM_BUFFER_ON_HEAP
-#cmakedefine RXM_BLOCKING
-
#define RXMQ_RES (@RXM_MIN_RESOLUTION@) /* 2^N ns */
#define RXMQ_BUMP (@RXM_WHEEL_MULTIPLIER@)
#define RXMQ_LVLS (@RXM_WHEEL_LEVELS@)
diff --git a/src/lib/crc/crc16.c b/src/lib/crc/crc16.c
new file mode 100644
index 00000000..9dc59429
--- /dev/null
+++ b/src/lib/crc/crc16.c
@@ -0,0 +1,61 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-16/CCITT-FALSE (reveng catalog, alias CRC-16/IBM-3740):
+ * poly = 0x1021
+ * init = 0xffff
+ * refin = false
+ * refout = false
+ * xorout = 0x0000
+ * check = crc16_ccitt_false("123456789") == 0x29b1
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc16.h>
+
+/* Bit-by-bit MSB-first CRC. */
+void crc16_ccitt_false(uint16_t * crc,
+ const void * buf,
+ size_t len)
+{
+ const uint8_t * p;
+ uint16_t c;
+ size_t n;
+ int i;
+
+ p = (const uint8_t *) buf;
+ c = *crc ^ 0xffff;
+
+ for (n = 0; n < len; n++) {
+ c ^= ((uint16_t) p[n]) << 8;
+ for (i = 0; i < 8; i++) {
+ if (c & 0x8000)
+ c = (uint16_t) ((c << 1) ^ 0x1021);
+ else
+ c = (uint16_t) (c << 1);
+ }
+ }
+
+ *crc = c;
+}
diff --git a/src/lib/crc32.c b/src/lib/crc/crc32.c
index 0fdb62b1..0fdb62b1 100644
--- a/src/lib/crc32.c
+++ b/src/lib/crc/crc32.c
diff --git a/src/lib/crc/crc64.c b/src/lib/crc/crc64.c
new file mode 100644
index 00000000..1b6fb5f6
--- /dev/null
+++ b/src/lib/crc/crc64.c
@@ -0,0 +1,363 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 64-bit Cyclic Redundancy Check (NVMe variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-64/NVMe (reveng catalog):
+ * poly = 0xad93d23594c93659
+ * init = 0xffffffffffffffff
+ * refin = true
+ * refout = true
+ * xorout = 0xffffffffffffffff
+ * check = crc64_nvme("123456789") == 0xae8b14860a799888
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc64.h>
+
+/*
+ * Reflected CRC-64/NVMe table. Polynomial in reflected form:
+ * 0x9a6c9329ac4bc9b5 (bitrev of 0xad93d23594c93659).
+ */
+static const uint64_t crc64_nvme_tab[256] = {
+ 0x0000000000000000ULL, 0x7f6ef0c830358979ULL,
+ 0xfedde190606b12f2ULL, 0x81b31158505e9b8bULL,
+ 0xc962e5739841b68fULL, 0xb60c15bba8743ff6ULL,
+ 0x37bf04e3f82aa47dULL, 0x48d1f42bc81f2d04ULL,
+ 0xa61cecb46814fe75ULL, 0xd9721c7c5821770cULL,
+ 0x58c10d24087fec87ULL, 0x27affdec384a65feULL,
+ 0x6f7e09c7f05548faULL, 0x1010f90fc060c183ULL,
+ 0x91a3e857903e5a08ULL, 0xeecd189fa00bd371ULL,
+ 0x78e0ff3b88be6f81ULL, 0x078e0ff3b88be6f8ULL,
+ 0x863d1eabe8d57d73ULL, 0xf953ee63d8e0f40aULL,
+ 0xb1821a4810ffd90eULL, 0xceecea8020ca5077ULL,
+ 0x4f5ffbd87094cbfcULL, 0x30310b1040a14285ULL,
+ 0xdefc138fe0aa91f4ULL, 0xa192e347d09f188dULL,
+ 0x2021f21f80c18306ULL, 0x5f4f02d7b0f40a7fULL,
+ 0x179ef6fc78eb277bULL, 0x68f0063448deae02ULL,
+ 0xe943176c18803589ULL, 0x962de7a428b5bcf0ULL,
+ 0xf1c1fe77117cdf02ULL, 0x8eaf0ebf2149567bULL,
+ 0x0f1c1fe77117cdf0ULL, 0x7072ef2f41224489ULL,
+ 0x38a31b04893d698dULL, 0x47cdebccb908e0f4ULL,
+ 0xc67efa94e9567b7fULL, 0xb9100a5cd963f206ULL,
+ 0x57dd12c379682177ULL, 0x28b3e20b495da80eULL,
+ 0xa900f35319033385ULL, 0xd66e039b2936bafcULL,
+ 0x9ebff7b0e12997f8ULL, 0xe1d10778d11c1e81ULL,
+ 0x606216208142850aULL, 0x1f0ce6e8b1770c73ULL,
+ 0x8921014c99c2b083ULL, 0xf64ff184a9f739faULL,
+ 0x77fce0dcf9a9a271ULL, 0x08921014c99c2b08ULL,
+ 0x4043e43f0183060cULL, 0x3f2d14f731b68f75ULL,
+ 0xbe9e05af61e814feULL, 0xc1f0f56751dd9d87ULL,
+ 0x2f3dedf8f1d64ef6ULL, 0x50531d30c1e3c78fULL,
+ 0xd1e00c6891bd5c04ULL, 0xae8efca0a188d57dULL,
+ 0xe65f088b6997f879ULL, 0x9931f84359a27100ULL,
+ 0x1882e91b09fcea8bULL, 0x67ec19d339c963f2ULL,
+ 0xd75adabd7a6e2d6fULL, 0xa8342a754a5ba416ULL,
+ 0x29873b2d1a053f9dULL, 0x56e9cbe52a30b6e4ULL,
+ 0x1e383fcee22f9be0ULL, 0x6156cf06d21a1299ULL,
+ 0xe0e5de5e82448912ULL, 0x9f8b2e96b271006bULL,
+ 0x71463609127ad31aULL, 0x0e28c6c1224f5a63ULL,
+ 0x8f9bd7997211c1e8ULL, 0xf0f5275142244891ULL,
+ 0xb824d37a8a3b6595ULL, 0xc74a23b2ba0eececULL,
+ 0x46f932eaea507767ULL, 0x3997c222da65fe1eULL,
+ 0xafba2586f2d042eeULL, 0xd0d4d54ec2e5cb97ULL,
+ 0x5167c41692bb501cULL, 0x2e0934dea28ed965ULL,
+ 0x66d8c0f56a91f461ULL, 0x19b6303d5aa47d18ULL,
+ 0x980521650afae693ULL, 0xe76bd1ad3acf6feaULL,
+ 0x09a6c9329ac4bc9bULL, 0x76c839faaaf135e2ULL,
+ 0xf77b28a2faafae69ULL, 0x8815d86aca9a2710ULL,
+ 0xc0c42c4102850a14ULL, 0xbfaadc8932b0836dULL,
+ 0x3e19cdd162ee18e6ULL, 0x41773d1952db919fULL,
+ 0x269b24ca6b12f26dULL, 0x59f5d4025b277b14ULL,
+ 0xd846c55a0b79e09fULL, 0xa72835923b4c69e6ULL,
+ 0xeff9c1b9f35344e2ULL, 0x90973171c366cd9bULL,
+ 0x1124202993385610ULL, 0x6e4ad0e1a30ddf69ULL,
+ 0x8087c87e03060c18ULL, 0xffe938b633338561ULL,
+ 0x7e5a29ee636d1eeaULL, 0x0134d92653589793ULL,
+ 0x49e52d0d9b47ba97ULL, 0x368bddc5ab7233eeULL,
+ 0xb738cc9dfb2ca865ULL, 0xc8563c55cb19211cULL,
+ 0x5e7bdbf1e3ac9decULL, 0x21152b39d3991495ULL,
+ 0xa0a63a6183c78f1eULL, 0xdfc8caa9b3f20667ULL,
+ 0x97193e827bed2b63ULL, 0xe877ce4a4bd8a21aULL,
+ 0x69c4df121b863991ULL, 0x16aa2fda2bb3b0e8ULL,
+ 0xf86737458bb86399ULL, 0x8709c78dbb8deae0ULL,
+ 0x06bad6d5ebd3716bULL, 0x79d4261ddbe6f812ULL,
+ 0x3105d23613f9d516ULL, 0x4e6b22fe23cc5c6fULL,
+ 0xcfd833a67392c7e4ULL, 0xb0b6c36e43a74e9dULL,
+ 0x9a6c9329ac4bc9b5ULL, 0xe50263e19c7e40ccULL,
+ 0x64b172b9cc20db47ULL, 0x1bdf8271fc15523eULL,
+ 0x530e765a340a7f3aULL, 0x2c608692043ff643ULL,
+ 0xadd397ca54616dc8ULL, 0xd2bd67026454e4b1ULL,
+ 0x3c707f9dc45f37c0ULL, 0x431e8f55f46abeb9ULL,
+ 0xc2ad9e0da4342532ULL, 0xbdc36ec59401ac4bULL,
+ 0xf5129aee5c1e814fULL, 0x8a7c6a266c2b0836ULL,
+ 0x0bcf7b7e3c7593bdULL, 0x74a18bb60c401ac4ULL,
+ 0xe28c6c1224f5a634ULL, 0x9de29cda14c02f4dULL,
+ 0x1c518d82449eb4c6ULL, 0x633f7d4a74ab3dbfULL,
+ 0x2bee8961bcb410bbULL, 0x548079a98c8199c2ULL,
+ 0xd53368f1dcdf0249ULL, 0xaa5d9839ecea8b30ULL,
+ 0x449080a64ce15841ULL, 0x3bfe706e7cd4d138ULL,
+ 0xba4d61362c8a4ab3ULL, 0xc52391fe1cbfc3caULL,
+ 0x8df265d5d4a0eeceULL, 0xf29c951de49567b7ULL,
+ 0x732f8445b4cbfc3cULL, 0x0c41748d84fe7545ULL,
+ 0x6bad6d5ebd3716b7ULL, 0x14c39d968d029fceULL,
+ 0x95708ccedd5c0445ULL, 0xea1e7c06ed698d3cULL,
+ 0xa2cf882d2576a038ULL, 0xdda178e515432941ULL,
+ 0x5c1269bd451db2caULL, 0x237c997575283bb3ULL,
+ 0xcdb181ead523e8c2ULL, 0xb2df7122e51661bbULL,
+ 0x336c607ab548fa30ULL, 0x4c0290b2857d7349ULL,
+ 0x04d364994d625e4dULL, 0x7bbd94517d57d734ULL,
+ 0xfa0e85092d094cbfULL, 0x856075c11d3cc5c6ULL,
+ 0x134d926535897936ULL, 0x6c2362ad05bcf04fULL,
+ 0xed9073f555e26bc4ULL, 0x92fe833d65d7e2bdULL,
+ 0xda2f7716adc8cfb9ULL, 0xa54187de9dfd46c0ULL,
+ 0x24f29686cda3dd4bULL, 0x5b9c664efd965432ULL,
+ 0xb5517ed15d9d8743ULL, 0xca3f8e196da80e3aULL,
+ 0x4b8c9f413df695b1ULL, 0x34e26f890dc31cc8ULL,
+ 0x7c339ba2c5dc31ccULL, 0x035d6b6af5e9b8b5ULL,
+ 0x82ee7a32a5b7233eULL, 0xfd808afa9582aa47ULL,
+ 0x4d364994d625e4daULL, 0x3258b95ce6106da3ULL,
+ 0xb3eba804b64ef628ULL, 0xcc8558cc867b7f51ULL,
+ 0x8454ace74e645255ULL, 0xfb3a5c2f7e51db2cULL,
+ 0x7a894d772e0f40a7ULL, 0x05e7bdbf1e3ac9deULL,
+ 0xeb2aa520be311aafULL, 0x944455e88e0493d6ULL,
+ 0x15f744b0de5a085dULL, 0x6a99b478ee6f8124ULL,
+ 0x224840532670ac20ULL, 0x5d26b09b16452559ULL,
+ 0xdc95a1c3461bbed2ULL, 0xa3fb510b762e37abULL,
+ 0x35d6b6af5e9b8b5bULL, 0x4ab846676eae0222ULL,
+ 0xcb0b573f3ef099a9ULL, 0xb465a7f70ec510d0ULL,
+ 0xfcb453dcc6da3dd4ULL, 0x83daa314f6efb4adULL,
+ 0x0269b24ca6b12f26ULL, 0x7d0742849684a65fULL,
+ 0x93ca5a1b368f752eULL, 0xeca4aad306bafc57ULL,
+ 0x6d17bb8b56e467dcULL, 0x12794b4366d1eea5ULL,
+ 0x5aa8bf68aecec3a1ULL, 0x25c64fa09efb4ad8ULL,
+ 0xa4755ef8cea5d153ULL, 0xdb1bae30fe90582aULL,
+ 0xbcf7b7e3c7593bd8ULL, 0xc399472bf76cb2a1ULL,
+ 0x422a5673a732292aULL, 0x3d44a6bb9707a053ULL,
+ 0x759552905f188d57ULL, 0x0afba2586f2d042eULL,
+ 0x8b48b3003f739fa5ULL, 0xf42643c80f4616dcULL,
+ 0x1aeb5b57af4dc5adULL, 0x6585ab9f9f784cd4ULL,
+ 0xe436bac7cf26d75fULL, 0x9b584a0fff135e26ULL,
+ 0xd389be24370c7322ULL, 0xace74eec0739fa5bULL,
+ 0x2d545fb4576761d0ULL, 0x523aaf7c6752e8a9ULL,
+ 0xc41748d84fe75459ULL, 0xbb79b8107fd2dd20ULL,
+ 0x3acaa9482f8c46abULL, 0x45a459801fb9cfd2ULL,
+ 0x0d75adabd7a6e2d6ULL, 0x721b5d63e7936bafULL,
+ 0xf3a84c3bb7cdf024ULL, 0x8cc6bcf387f8795dULL,
+ 0x620ba46c27f3aa2cULL, 0x1d6554a417c62355ULL,
+ 0x9cd645fc4798b8deULL, 0xe3b8b53477ad31a7ULL,
+ 0xab69411fbfb21ca3ULL, 0xd407b1d78f8795daULL,
+ 0x55b4a08fdfd90e51ULL, 0x2ada5047efec8728ULL
+};
+
+static __inline__ uint64_t crc64_nvme_step(uint64_t c,
+ const uint8_t * p,
+ size_t len)
+{
+ size_t n;
+
+ for (n = 0; n < len; n++)
+ c = crc64_nvme_tab[(c ^ p[n]) & 0xff] ^ (c >> 8);
+
+ return c;
+}
+
+void crc64_nvme_table(uint64_t * crc,
+ const void * buf,
+ size_t len)
+{
+ uint64_t c;
+
+ c = crc64_nvme_step(*crc ^ UINT64_MAX,
+ (const uint8_t *) buf, len);
+
+ *crc = c ^ UINT64_MAX;
+}
+
+#ifdef HAVE_PCLMUL
+
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+/*
+ * Fold-by-16 constants for reflected CRC-64/NVMe. Properties of the
+ * polynomial; identical between the PCLMUL and PMULL backends.
+ * k3 = bitrev64(x^(128+64) mod P) << 1
+ * k4 = bitrev64(x^(128+0) mod P) << 1
+ */
+static const uint64_t k3_clmul = 0xeadc41fd2ba3d420ULL;
+static const uint64_t k4_clmul = 0x21e9761e252621acULL;
+
+__attribute__((target("pclmul,sse4.1")))
+static __m128i fold16(__m128i x,
+ __m128i k)
+{
+ __m128i lo;
+ __m128i hi;
+
+ lo = _mm_clmulepi64_si128(x, k, 0x00);
+ hi = _mm_clmulepi64_si128(x, k, 0x11);
+ return _mm_xor_si128(lo, hi);
+}
+
+/*
+ * Fold-by-16 over 16-byte chunks; the 128-bit folded state is then
+ * emitted as 16 little-endian bytes and run through the byte-table
+ * loop together with any tail (<=15 bytes). The 16-byte minimum on
+ * the bulk loop is why the short-input path uses the table directly.
+ */
+__attribute__((target("pclmul,sse4.1")))
+static void crc64_nvme_clmul(uint64_t * crc,
+ const void * buf,
+ size_t len)
+{
+ const uint8_t * p;
+ uint64_t seed;
+ uint64_t c;
+ size_t off;
+ __m128i x;
+ __m128i k;
+ uint8_t post[16];
+
+ p = (const uint8_t *) buf;
+ seed = *crc;
+
+ if (len < 16) {
+ c = crc64_nvme_step(seed ^ UINT64_MAX, p, len);
+ *crc = c ^ UINT64_MAX;
+ return;
+ }
+
+ x = _mm_loadu_si128((const __m128i *) p);
+ x = _mm_xor_si128(x, _mm_cvtsi64_si128((int64_t)
+ (seed ^ UINT64_MAX)));
+
+ k = _mm_set_epi64x((int64_t) k4_clmul, (int64_t) k3_clmul);
+
+ off = 16;
+ while (off + 16 <= len) {
+ __m128i d;
+
+ d = _mm_loadu_si128((const __m128i *) (p + off));
+ x = _mm_xor_si128(fold16(x, k), d);
+ off += 16;
+ }
+
+ _mm_storeu_si128((__m128i *) post, x);
+
+ c = crc64_nvme_step(0, post, 16);
+ c = crc64_nvme_step(c, p + off, len - off);
+
+ *crc = c ^ UINT64_MAX;
+}
+
+#endif /* HAVE_PCLMUL */
+
+#ifdef HAVE_PMULL
+
+#include <arm_neon.h>
+
+/* Same fold-by-16 constants as the PCLMUL path (poly properties). */
+static const uint64_t k3_pmull = 0xeadc41fd2ba3d420ULL;
+static const uint64_t k4_pmull = 0x21e9761e252621acULL;
+
+__attribute__((target("+crypto")))
+static uint64x2_t fold16_pmull(uint64x2_t x,
+ uint64x2_t k)
+{
+ poly64x2_t xp;
+ poly64x2_t kp;
+ uint64x2_t lo;
+ uint64x2_t hi;
+
+ xp = vreinterpretq_p64_u64(x);
+ kp = vreinterpretq_p64_u64(k);
+ lo = vreinterpretq_u64_p128(
+ vmull_p64((poly64_t) vgetq_lane_u64(x, 0),
+ (poly64_t) vgetq_lane_u64(k, 0)));
+ hi = vreinterpretq_u64_p128(vmull_high_p64(xp, kp));
+ return veorq_u64(lo, hi);
+}
+
+__attribute__((target("+crypto")))
+static void crc64_nvme_pmull(uint64_t * crc,
+ const void * buf,
+ size_t len)
+{
+ const uint8_t * p;
+ uint64_t seed;
+ uint64_t c;
+ size_t off;
+ uint64x2_t x;
+ uint64x2_t k;
+ uint64_t seed_lane[2];
+ uint64_t k_lanes[2];
+ uint8_t post[16];
+
+ p = (const uint8_t *) buf;
+ seed = *crc;
+
+ if (len < 16) {
+ c = crc64_nvme_step(seed ^ UINT64_MAX, p, len);
+ *crc = c ^ UINT64_MAX;
+ return;
+ }
+
+ x = vld1q_u64((const uint64_t *) p);
+ seed_lane[0] = seed ^ UINT64_MAX;
+ seed_lane[1] = 0;
+ x = veorq_u64(x, vld1q_u64(seed_lane));
+
+ k_lanes[0] = k3_pmull;
+ k_lanes[1] = k4_pmull;
+ k = vld1q_u64(k_lanes);
+
+ off = 16;
+ while (off + 16 <= len) {
+ uint64x2_t d;
+
+ d = vld1q_u64((const uint64_t *) (p + off));
+ x = veorq_u64(fold16_pmull(x, k), d);
+ off += 16;
+ }
+
+ vst1q_u8(post, vreinterpretq_u8_u64(x));
+
+ c = crc64_nvme_step(0, post, 16);
+ c = crc64_nvme_step(c, p + off, len - off);
+
+ *crc = c ^ UINT64_MAX;
+}
+#endif /* HAVE_PMULL */
+
+void crc64_nvme(uint64_t * crc,
+ const void * buf,
+ size_t len)
+{
+#ifdef HAVE_PCLMUL
+ crc64_nvme_clmul(crc, buf, len);
+#elif defined(HAVE_PMULL)
+ crc64_nvme_pmull(crc, buf, len);
+#else
+ crc64_nvme_table(crc, buf, len);
+#endif
+}
diff --git a/src/lib/crc/crc8.c b/src/lib/crc/crc8.c
new file mode 100644
index 00000000..20976b29
--- /dev/null
+++ b/src/lib/crc/crc8.c
@@ -0,0 +1,62 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 8-bit Cyclic Redundancy Check (AUTOSAR variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-8/AUTOSAR (reveng catalog):
+ * poly = 0x2f
+ * init = 0xff
+ * refin = false
+ * refout = false
+ * xorout = 0xff
+ * check = crc8_autosar("123456789") == 0xdf
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc8.h>
+
+
+ /* Bit-by-bit MSB-first CRC. */
+void crc8_autosar(uint8_t * crc,
+ const void * buf,
+ size_t len)
+{
+ const uint8_t * p;
+ uint8_t c;
+ size_t n;
+ int i;
+
+ p = (const uint8_t *) buf;
+ c = *crc ^ 0xff;
+
+ for (n = 0; n < len; n++) {
+ c ^= p[n];
+ for (i = 0; i < 8; i++) {
+ if (c & 0x80)
+ c = (uint8_t) ((c << 1) ^ 0x2f);
+ else
+ c = (uint8_t) (c << 1);
+ }
+ }
+
+ *crc = c ^ 0xff;
+}
diff --git a/src/lib/crc/tests/CMakeLists.txt b/src/lib/crc/tests/CMakeLists.txt
new file mode 100644
index 00000000..11daca5a
--- /dev/null
+++ b/src/lib/crc/tests/CMakeLists.txt
@@ -0,0 +1,21 @@
+get_filename_component(PARENT_PATH ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+get_filename_component(PARENT_DIR ${PARENT_PATH} NAME)
+
+compute_test_prefix()
+
+create_test_sourcelist(${PARENT_DIR}_tests test_suite.c
+ # Add new tests here
+ crc8_test.c
+ crc16_test.c
+ crc32_test.c
+ crc64_test.c
+ )
+
+add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests})
+
+disable_test_logging_for_target(${PARENT_DIR}_test)
+target_link_libraries(${PARENT_DIR}_test ouroboros-common)
+
+add_dependencies(build_tests ${PARENT_DIR}_test)
+
+ouroboros_register_tests(TARGET ${PARENT_DIR}_test TESTS ${${PARENT_DIR}_tests})
diff --git a/src/lib/crc/tests/crc16_test.c b/src/lib/crc/tests/crc16_test.c
new file mode 100644
index 00000000..03a5b504
--- /dev/null
+++ b/src/lib/crc/tests/crc16_test.c
@@ -0,0 +1,67 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-16/CCITT-FALSE function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc16.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* reveng-catalog smoke vectors. */
+static int test_crc16_ccitt_false_basic(void)
+{
+ uint16_t crc;
+
+ TEST_START();
+
+ crc = 0;
+ crc16_ccitt_false(&crc, "", 0);
+ if (crc != 0xffff)
+ goto fail;
+
+ crc = 0;
+ crc16_ccitt_false(&crc, "123456789", 9);
+ if (crc != 0x29b1)
+ goto fail;
+
+ TEST_SUCCESS();
+ return TEST_RC_SUCCESS;
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+int crc16_test(int argc,
+ char ** argv)
+{
+ int ret = 0;
+
+ (void) argc;
+ (void) argv;
+
+ ret |= test_crc16_ccitt_false_basic();
+ return ret;
+}
diff --git a/src/lib/tests/crc32_test.c b/src/lib/crc/tests/crc32_test.c
index 5a1ddd87..5a1ddd87 100644
--- a/src/lib/tests/crc32_test.c
+++ b/src/lib/crc/tests/crc32_test.c
diff --git a/src/lib/crc/tests/crc64_test.c b/src/lib/crc/tests/crc64_test.c
new file mode 100644
index 00000000..cf3f5ca3
--- /dev/null
+++ b/src/lib/crc/tests/crc64_test.c
@@ -0,0 +1,126 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-64/NVMe function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc64.h>
+#include <ouroboros/random.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* Reference impl, internal to libouroboros-common. */
+extern void crc64_nvme_table(uint64_t * crc,
+ const void * buf,
+ size_t len);
+
+/* reveng-catalog smoke vectors plus a 16-byte fold-boundary check. */
+static int test_crc64_nvme_basic(void)
+{
+ uint64_t crc;
+
+ TEST_START();
+
+ crc = 0;
+ crc64_nvme(&crc, "", 0);
+ if (crc != 0x0000000000000000ULL)
+ goto fail;
+
+ crc = 0;
+ crc64_nvme(&crc, "123456789", 9);
+ if (crc != 0xae8b14860a799888ULL)
+ goto fail;
+
+ crc = 0;
+ crc64_nvme(&crc, "0123456789abcdef", 16);
+ if (crc != 0x091485ca7018730eULL)
+ goto fail;
+
+ TEST_SUCCESS();
+ return TEST_RC_SUCCESS;
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL)
+/* Cross-check the accelerated dispatcher path against the byte-table. */
+static int test_crc64_nvme_random(void)
+{
+ static const size_t lens[] = {
+ 0, 1, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+ 129, 255, 256, 257, 1023, 1024, 1025, 4096
+ };
+ uint8_t buf[4096];
+ size_t i;
+ uint64_t ref;
+ uint64_t got;
+
+ TEST_START();
+
+ if (random_buffer(buf, sizeof(buf)) < 0) {
+ printf("Failed to generate random data.\n");
+ goto fail;
+ }
+
+ for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
+ ref = 0;
+ crc64_nvme_table(&ref, buf, lens[i]);
+
+ got = 0;
+ crc64_nvme(&got, buf, lens[i]);
+
+ if (ref == got)
+ continue;
+
+ printf("Mismatch at len=%zu: table=0x%016lx disp=0x%016lx\n",
+ lens[i],
+ (unsigned long) ref,
+ (unsigned long) got);
+ goto fail;
+ }
+
+ TEST_SUCCESS();
+ return TEST_RC_SUCCESS;
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+#endif
+}
+
+int crc64_test(int argc,
+ char ** argv)
+{
+ int ret = 0;
+
+ (void) argc;
+ (void) argv;
+
+ ret |= test_crc64_nvme_basic();
+#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL)
+ ret |= test_crc64_nvme_random();
+#endif
+ return ret;
+}
diff --git a/src/lib/crc/tests/crc8_test.c b/src/lib/crc/tests/crc8_test.c
new file mode 100644
index 00000000..f7bb33b8
--- /dev/null
+++ b/src/lib/crc/tests/crc8_test.c
@@ -0,0 +1,67 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-8/AUTOSAR function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc8.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* reveng-catalog smoke vectors. */
+static int test_crc8_autosar_basic(void)
+{
+ uint8_t crc;
+
+ TEST_START();
+
+ crc = 0;
+ crc8_autosar(&crc, "", 0);
+ if (crc != 0x00)
+ goto fail;
+
+ crc = 0;
+ crc8_autosar(&crc, "123456789", 9);
+ if (crc != 0xdf)
+ goto fail;
+
+ TEST_SUCCESS();
+ return TEST_RC_SUCCESS;
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+int crc8_test(int argc,
+ char ** argv)
+{
+ int ret = 0;
+
+ (void) argc;
+ (void) argv;
+
+ ret |= test_crc8_autosar_basic();
+ return ret;
+}
diff --git a/src/lib/crypt.c b/src/lib/crypt.c
index cd3421dd..71197f6e 100644
--- a/src/lib/crypt.c
+++ b/src/lib/crypt.c
@@ -1094,6 +1094,13 @@ void crypt_secure_malloc_fini(void)
#endif
}
+void crypt_cleanup(void)
+{
+#ifdef HAVE_OPENSSL
+ openssl_cleanup();
+#endif
+}
+
void * crypt_secure_malloc(size_t size)
{
#ifdef HAVE_OPENSSL
diff --git a/src/lib/crypt/openssl.c b/src/lib/crypt/openssl.c
index 573bc0b3..5916e3cb 100644
--- a/src/lib/crypt/openssl.c
+++ b/src/lib/crypt/openssl.c
@@ -629,7 +629,7 @@ ssize_t openssl_pkp_create(const char * algo,
return (ssize_t) raw.len;
} else { /* DER encode standard algorithms */
- pos = pk; /* i2d_PUBKEY increments the pointer, don't use pk! */
+ pos = pk; /* i2d_PUBKEY increments the ptr, don't use pk! */
len = i2d_PUBKEY(*pkp, &pos);
if (len < 0)
goto fail_pubkey;
@@ -666,7 +666,7 @@ static ssize_t __openssl_kem_encap(EVP_PKEY * pub,
/* Get required lengths */
ret = EVP_PKEY_encapsulate(ctx, NULL, &ct_len, NULL, &secret_len);
- if (ret != 1 || ct_len > MSGBUFSZ)
+ if (ret != 1 || ct_len > CRYPT_KEY_BUFSZ)
goto fail_encap;
/* Allocate buffer for secret */
@@ -1283,24 +1283,14 @@ int openssl_load_privkey_file(const char * path,
{
FILE * fp;
EVP_PKEY * pkey;
- unsigned long err;
- char errbuf[256];
fp = fopen(path, "r");
- if (fp == NULL) {
- fprintf(stderr, "Failed to open %s\n", path);
+ if (fp == NULL)
goto fail_file;
- }
pkey = PEM_read_PrivateKey(fp, NULL, NULL, "");
- if (pkey == NULL) {
- err = ERR_get_error();
- ERR_error_string_n(err, errbuf, sizeof(errbuf));
- fprintf(stderr,
- "OpenSSL error loading privkey from %s: %s\n",
- path, errbuf);
+ if (pkey == NULL)
goto fail_key;
- }
fclose(fp);
@@ -1442,7 +1432,7 @@ int openssl_load_pubkey_raw_file(const char * path,
buffer_t * buf)
{
FILE * fp;
- uint8_t tmp_buf[MSGBUFSZ];
+ uint8_t tmp_buf[CRYPT_KEY_BUFSZ];
size_t bytes_read;
const char * algo;
@@ -1453,7 +1443,7 @@ int openssl_load_pubkey_raw_file(const char * path,
if (fp == NULL)
goto fail_file;
- bytes_read = fread(tmp_buf, 1, MSGBUFSZ, fp);
+ bytes_read = fread(tmp_buf, 1, CRYPT_KEY_BUFSZ, fp);
if (bytes_read == 0)
goto fail_read;
@@ -1658,25 +1648,33 @@ int openssl_crt_str(const void * crt,
int openssl_crt_der(const void * crt,
buffer_t * buf)
{
- int len;
+ uint8_t * p;
+ int len;
assert(crt != NULL);
assert(buf != NULL);
- len = i2d_X509((X509 *) crt, &buf->data);
+ /* Get the size by encoding to NULL */
+ len = i2d_X509((X509 *) crt, NULL);
if (len < 0)
- goto fail_der;
+ goto fail_len;
+ buf->data = malloc((size_t) len);
+ if (buf->data == NULL)
+ goto fail_malloc;
+
+ p = buf->data; /* i2d_X509 increments p */
+ i2d_X509((X509 *) crt, &p);
buf->len = (size_t) len;
return 0;
- fail_der:
+ fail_malloc:
+ fail_len:
clrbuf(*buf);
return -1;
}
-
void * openssl_auth_create_store(void)
{
return X509_STORE_new();
@@ -1878,3 +1876,7 @@ void openssl_secure_clear(void * ptr,
{
OPENSSL_cleanse(ptr, size);
}
+void openssl_cleanup(void)
+{
+ OPENSSL_cleanup();
+}
diff --git a/src/lib/crypt/openssl.h b/src/lib/crypt/openssl.h
index b95d1b0b..af285232 100644
--- a/src/lib/crypt/openssl.h
+++ b/src/lib/crypt/openssl.h
@@ -169,4 +169,6 @@ void openssl_secure_free(void * ptr,
void openssl_secure_clear(void * ptr,
size_t size);
+void openssl_cleanup(void);
+
#endif /* OUROBOROS_LIB_CRYPT_OPENSSL_H */
diff --git a/src/lib/dev.c b/src/lib/dev.c
index 9cfc24ee..ae0401b7 100644
--- a/src/lib/dev.c
+++ b/src/lib/dev.c
@@ -29,10 +29,13 @@
#include "config.h"
#include "ssm.h"
+#include <ouroboros/atomics.h>
#include <ouroboros/bitmap.h>
#include <ouroboros/cep.h>
+#include <ouroboros/crc16.h>
#include <ouroboros/crypt.h>
#include <ouroboros/dev.h>
+#include <ouroboros/endian.h>
#include <ouroboros/errno.h>
#include <ouroboros/fccntl.h>
#include <ouroboros/flow.h>
@@ -45,32 +48,33 @@
#include <ouroboros/np1_flow.h>
#include <ouroboros/pthread.h>
#include <ouroboros/random.h>
+#ifdef PROC_FLOW_STATS
+#include <ouroboros/rib.h>
+#endif
#include <ouroboros/serdes-irm.h>
+#include <ouroboros/sockets.h>
#include <ouroboros/ssm_flow_set.h>
#include <ouroboros/ssm_pool.h>
#include <ouroboros/ssm_rbuff.h>
-#include <ouroboros/sockets.h>
+#include <ouroboros/tw.h>
#include <ouroboros/utils.h>
-#ifdef PROC_FLOW_STATS
-#include <ouroboros/rib.h>
-#endif
+#include <assert.h>
#ifdef HAVE_LIBGCRYPT
#include <gcrypt.h>
#endif
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
#include <stdarg.h>
#include <stdbool.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
#include <sys/types.h>
#ifndef CLOCK_REALTIME_COARSE
#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
#endif
-/* Partial read information. */
#define NO_PART -1
#define DONE_PART -2
@@ -78,19 +82,12 @@
#define SECMEMSZ 16384
#define MSGBUFSZ 2048
-/* map flow_ids to flow descriptors; track state of the flow */
struct fmap {
int fd;
- /* TODO: use actual flow state */
enum flow_state state;
};
-#define frcti_to_flow(frcti) \
- ((struct flow *)((uint8_t *) frcti - offsetof(struct flow, frcti)))
-
struct flow {
- struct list_head next;
-
struct flow_info info;
struct ssm_rbuff * rx_rb;
@@ -135,16 +132,10 @@ struct {
struct flow * flows;
struct fmap * id_to_fd;
- struct list_head flow_list;
pthread_mutex_t mtx;
pthread_cond_t cond;
- pthread_t tx;
- pthread_t rx;
- size_t n_frcti;
- fset_t * frct_set;
-
pthread_rwlock_t lock;
} proc;
@@ -243,7 +234,7 @@ static int proc_announce(const struct proc_info * proc)
return irm__irm_result_des(&msg);
}
-/* IRMd will clean up the mess if this fails */
+/* IRMd cleans up on failure. */
static void proc_exit(void)
{
uint8_t buf[SOCK_BUF_SIZE];
@@ -264,7 +255,7 @@ static int spb_encrypt(struct flow * flow,
uint8_t * tail;
if (flow->crypt == NULL)
- return 0; /* No encryption */
+ return 0;
in.data = ssm_pk_buff_head(spb);
in.len = ssm_pk_buff_len(spb);
@@ -272,11 +263,11 @@ static int spb_encrypt(struct flow * flow,
if (crypt_encrypt(flow->crypt, in, &out) < 0)
goto fail_encrypt;
- head = ssm_pk_buff_head_alloc(spb, flow->headsz);
+ head = ssm_pk_buff_push(spb, flow->headsz);
if (head == NULL)
goto fail_alloc;
- tail = ssm_pk_buff_tail_alloc(spb, flow->tailsz);
+ tail = ssm_pk_buff_push_tail(spb, flow->tailsz);
if (tail == NULL)
goto fail_alloc;
@@ -299,7 +290,7 @@ static int spb_decrypt(struct flow * flow,
uint8_t * head;
if (flow->crypt == NULL)
- return 0; /* No decryption */
+ return 0;
in.data = ssm_pk_buff_head(spb);
in.len = ssm_pk_buff_len(spb);
@@ -308,8 +299,8 @@ static int spb_decrypt(struct flow * flow,
return -ENOMEM;
- head = ssm_pk_buff_head_release(spb, flow->headsz) + flow->headsz;
- ssm_pk_buff_tail_release(spb, flow->tailsz);
+ head = ssm_pk_buff_pop(spb, flow->headsz) + flow->headsz;
+ ssm_pk_buff_pop_tail(spb, flow->tailsz);
memcpy(head, out.data, out.len);
@@ -318,130 +309,280 @@ static int spb_decrypt(struct flow * flow,
return 0;
}
-#include "frct.c"
-
-void * flow_tx(void * o)
+/* tw_move under proc.lock rdlock; gates teardown vs in-flight fires. */
+static void tw_move_safe(void)
{
- struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
-
- (void) o;
+ pthread_rwlock_rdlock(&proc.lock);
- while (true) {
- timerwheel_move();
+ pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
- nanosleep(&tic, NULL);
- }
+ tw_move();
- return (void *) 0;
+ pthread_cleanup_pop(1);
}
-static void flow_send_keepalive(struct flow * flow,
- struct timespec now)
+static int crc_add(struct ssm_pk_buff * spb,
+ size_t head_skip)
{
- struct ssm_pk_buff * spb;
- ssize_t idx;
- uint8_t * ptr;
-
- idx = ssm_pool_alloc(proc.pool, 0, &ptr, &spb);
- if (idx < 0)
- return;
+ uint8_t * head;
+ uint8_t * tail;
- pthread_rwlock_wrlock(&proc.lock);
+ tail = ssm_pk_buff_push_tail(spb, CRCLEN);
+ if (tail == NULL)
+ return -ENOMEM;
- flow->snd_act = now;
+ head = ssm_pk_buff_head(spb) + head_skip;
- if (ssm_rbuff_write(flow->tx_rb, idx))
- ssm_pool_remove(proc.pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ mem_hash(HASH_CRC32, tail, head, tail - head);
- pthread_rwlock_unlock(&proc.lock);
+ return 0;
}
-/* Needs rdlock on proc. */
-static void _flow_keepalive(struct flow * flow)
+static int crc_check(struct ssm_pk_buff * spb,
+ size_t head_skip)
{
- struct timespec now;
- struct timespec s_act;
- struct timespec r_act;
- int flow_id;
- time_t timeo;
- uint32_t acl;
+ uint32_t crc;
+ uint8_t * head = ssm_pk_buff_head(spb) + head_skip;
+ uint8_t * tail = ssm_pk_buff_pop_tail(spb, CRCLEN);
+
+ mem_hash(HASH_CRC32, &crc, head, tail - head);
- s_act = flow->snd_act;
- r_act = flow->rcv_act;
+ return !(crc == *((uint32_t *) tail));
+}
- flow_id = flow->info.id;
- timeo = flow->info.qs.timeout;
+/* FRCT included here so it can use proc and dev.c statics directly. */
+#include "frct.c"
- acl = ssm_rbuff_get_acl(flow->rx_rb);
- if (timeo == 0 || acl & (ACL_FLOWPEER | ACL_FLOWDOWN))
- return;
+/*
+ * SACK / DATA carry trailer CRC32; HCS protects the headers on every
+ * FRCT packet. Decrypt before any check so plaintext is authoritative.
+ */
+static bool invalid_pkt(struct flow * flow,
+ struct ssm_pk_buff * spb)
+{
+ const struct frct_pci * pci;
+ uint16_t flags;
+ size_t pci_total;
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ if (spb == NULL || ssm_pk_buff_len(spb) == 0)
+ return true;
- if (ts_diff_ns(&now, &r_act) > (int64_t) timeo * MILLION) {
- ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWPEER);
- ssm_flow_set_notify(proc.fqset, flow_id, FLOW_PEER);
- return;
+ if (spb_decrypt(flow, spb) < 0)
+ return true;
+
+ if (flow->frcti == NULL) {
+ if (flow->info.qs.ber == 0 && crc_check(spb, 0) != 0)
+ return true;
+ return false;
}
- if (ts_diff_ns(&now, &s_act) > (int64_t) timeo * (MILLION >> 2)) {
- pthread_rwlock_unlock(&proc.lock);
+ if (ssm_pk_buff_len(spb) < FRCT_PCILEN)
+ return true;
- flow_send_keepalive(flow, now);
+ pci = (const struct frct_pci *) ssm_pk_buff_head(spb);
+ flags = ntoh16(pci->flags);
- pthread_rwlock_rdlock(&proc.lock);
+ /* Untrusted flag read; mismatch on HCS will drop on corrupt. */
+ if (flags & FRCT_DATA)
+ pci_total = frcti_data_hdr_len(flow->frcti);
+ else
+ pci_total = frcti_ctrl_hdr_len(flow->frcti);
+
+ if (ssm_pk_buff_len(spb) < pci_total)
+ return true;
+
+ if (frct_hcs_check(pci, flow->frcti) != 0)
+ return true;
+
+ /* HCS valid: CRC32 on SACK; or on DATA if ber = 0. */
+ if (flags & FRCT_SACK) {
+ if (crc_check(spb, pci_total) != 0)
+ return true;
+
+ } else if ((flags & FRCT_DATA) && flow->info.qs.ber == 0) {
+ if (crc_check(spb, pci_total) != 0)
+ return true;
}
+
+ return false;
}
-static void handle_keepalives(void)
+static bool deadline_passed(const struct timespec * abs)
{
- struct list_head * p;
- struct list_head * h;
+ struct timespec now;
- pthread_rwlock_rdlock(&proc.lock);
+ if (abs == NULL)
+ return false;
- list_for_each_safe(p, h, &proc.flow_list) {
- struct flow * flow;
- flow = list_entry(p, struct flow, next);
- _flow_keepalive(flow);
- }
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
- pthread_rwlock_unlock(&proc.lock);
+ return ts_diff_ns(&now, abs) >= 0;
}
-static void __cleanup_fqueue_destroy(void * fq)
+/* Clamp the wait by min(dl, next tw expiry, now + TICTIME). */
+static void compute_wait_deadline(const struct timespec * dl,
+ struct timespec * out)
{
- fqueue_destroy((fqueue_t *) fq);
+ struct timespec now;
+ struct timespec cap;
+ struct timespec expiry;
+ struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, &tic, &cap);
+
+ tw_next_expiry(&expiry);
+
+ *out = (ts_diff_ns(&cap, &expiry) < 0) ? expiry : cap;
+ if (dl != NULL && ts_diff_ns(out, dl) > 0)
+ *out = *dl;
}
-void * flow_rx(void * o)
+/*
+ * proc.lock rdlock held across each iteration so flow_fini's wrlock
+ * waits for us to finish; FLOWDOWN already set means we exit promptly.
+ */
+static void flow_drain_rx_nb(struct flow * flow)
{
- struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
- int ret;
- struct fqueue * fq;
+ ssize_t idx;
+ struct ssm_pk_buff * spb;
+ struct ssm_rbuff * rx_rb;
+ struct frcti * frcti;
+#ifdef PROC_FLOW_STATS
+ struct timespec t_a;
+ struct timespec t_b;
+#endif
+
+ if (flow->frcti != NULL)
+ STAT_BUMP(flow->frcti, drain_calls);
- (void) o;
+ while (true) {
+ pthread_rwlock_rdlock(&proc.lock);
- fq = fqueue_create();
+ rx_rb = flow->rx_rb;
+ if (rx_rb == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return;
+ }
- pthread_cleanup_push(__cleanup_fqueue_destroy, fq);
+ idx = ssm_rbuff_read(rx_rb);
+ if (idx < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ return;
+ }
- /* fevent will filter all FRCT packets for us */
- while ((ret = fevent(proc.frct_set, fq, &tic)) != 0) {
- if (ret == -ETIMEDOUT) {
- handle_keepalives();
+ spb = ssm_pool_get(proc.pool, idx);
+ if (invalid_pkt(flow, spb)) {
+ ssm_pool_remove(proc.pool, idx);
+ pthread_rwlock_unlock(&proc.lock);
continue;
}
- while (fqueue_next(fq) >= 0)
- ; /* no need to act */
+ frcti = flow->frcti;
+ if (frcti != NULL) {
+#ifdef PROC_FLOW_STATS
+ clock_gettime(CLOCK_MONOTONIC, &t_a);
+ FRCTI_RCV(frcti, spb);
+ clock_gettime(CLOCK_MONOTONIC, &t_b);
+ STAT_ADD(frcti, rcv_proc_ns,
+ (size_t) ts_diff_ns(&t_b, &t_a));
+#else
+ FRCTI_RCV(frcti, spb);
+#endif
+ } else {
+ ssm_pool_remove(proc.pool, idx);
+ }
+
+ pthread_rwlock_unlock(&proc.lock);
+
+ /* Per-packet so the delayed-ACK fires on time in a burst. */
+#ifdef PROC_FLOW_STATS
+ clock_gettime(CLOCK_MONOTONIC, &t_a);
+ tw_move_safe();
+ clock_gettime(CLOCK_MONOTONIC, &t_b);
+ if (frcti != NULL)
+ STAT_ADD(frcti, tw_move_ns,
+ (size_t) ts_diff_ns(&t_b, &t_a));
+#else
+ tw_move_safe();
+#endif
}
+}
- pthread_cleanup_pop(true);
+/*
+ * Wait clamped by caller deadline, next tw expiry, and TICTIME;
+ * a clamp-timeout means tw work is due, not caller-deadline.
+ */
+static int flow_rx_one(struct flow * flow,
+ struct timespec * abs)
+{
+ struct timespec wait_abs;
+ struct ssm_pk_buff * spb;
+ struct ssm_rbuff * rx_rb;
+ ssize_t idx;
+
+ while (true) {
+ compute_wait_deadline(abs, &wait_abs);
+
+ /* rdlock gates flow_fini; FLOWDOWN preempts the block. */
+ pthread_rwlock_rdlock(&proc.lock);
+
+ rx_rb = flow->rx_rb;
+ if (rx_rb == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return -EFLOWDOWN;
+ }
+
+ idx = ssm_rbuff_read_b(rx_rb, &wait_abs);
+ if (idx == -ETIMEDOUT) {
+ pthread_rwlock_unlock(&proc.lock);
+ if (deadline_passed(abs))
+ return -ETIMEDOUT;
+ tw_move_safe();
+ continue;
+ }
+ if (idx < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ return idx;
+ }
+
+ spb = ssm_pool_get(proc.pool, idx);
+ if (invalid_pkt(flow, spb)) {
+ ssm_pool_remove(proc.pool, idx);
+ pthread_rwlock_unlock(&proc.lock);
+ continue;
+ }
+
+ if (flow->frcti != NULL)
+ FRCTI_RCV(flow->frcti, spb);
+ else
+ ssm_pool_remove(proc.pool, idx);
+
+ pthread_rwlock_unlock(&proc.lock);
- return (void *) 0;
+ tw_move_safe();
+ return 0;
+ }
+}
+
+/* 0 = window open; -EAGAIN = !block and would block; else flow_rx_one rc. */
+static __inline__ int flow_wait_window(struct flow * flow,
+ size_t n,
+ bool block,
+ struct timespec * dl)
+{
+ int rc;
+
+ while (true) {
+ flow_drain_rx_nb(flow);
+ if (FRCTI_IS_WINDOW_OPEN_N(flow->frcti, n))
+ return 0;
+ if (!block)
+ return -EAGAIN;
+ rc = flow_rx_one(flow, dl);
+ if (rc < 0)
+ return rc;
+ }
}
static void flow_clear(int fd)
@@ -451,36 +592,40 @@ static void flow_clear(int fd)
proc.flows[fd].info.id = -1;
}
-static void __flow_fini(int fd)
+/*
+ * Set ACL_FLOWDOWN on rx/tx so any in-flight blocking reads or writes
+ * wake up and drop their proc.lock rdlock. Must run BEFORE flow_fini's
+ * wrlock, else the wrlock blocks on those rdlock holders and the
+ * in-flight calls never see the FLOWDOWN signal.
+ */
+static void flow_quiesce(int fd)
{
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ struct ssm_rbuff * rx_rb = proc.flows[fd].rx_rb;
+ struct ssm_rbuff * tx_rb = proc.flows[fd].tx_rb;
- if (proc.flows[fd].frcti != NULL) {
- proc.n_frcti--;
- if (proc.n_frcti == 0) {
- pthread_cancel(proc.tx);
- pthread_join(proc.tx, NULL);
- }
+ if (rx_rb != NULL)
+ ssm_rbuff_set_acl(rx_rb, ACL_FLOWDOWN);
+ if (tx_rb != NULL)
+ ssm_rbuff_set_acl(tx_rb, ACL_FLOWDOWN);
+}
- ssm_flow_set_del(proc.fqset, 0, proc.flows[fd].info.id);
+static void do_flow_fini(int fd)
+{
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
+ if (proc.flows[fd].frcti != NULL)
frcti_destroy(proc.flows[fd].frcti);
- }
if (proc.flows[fd].info.id != -1) {
flow_destroy(&proc.id_to_fd[proc.flows[fd].info.id]);
bmp_release(proc.fds, fd);
}
- if (proc.flows[fd].rx_rb != NULL) {
- ssm_rbuff_set_acl(proc.flows[fd].rx_rb, ACL_FLOWDOWN);
+ if (proc.flows[fd].rx_rb != NULL)
ssm_rbuff_close(proc.flows[fd].rx_rb);
- }
- if (proc.flows[fd].tx_rb != NULL) {
- ssm_rbuff_set_acl(proc.flows[fd].tx_rb, ACL_FLOWDOWN);
+ if (proc.flows[fd].tx_rb != NULL)
ssm_rbuff_close(proc.flows[fd].tx_rb);
- }
if (proc.flows[fd].set != NULL) {
ssm_flow_set_notify(proc.flows[fd].set,
@@ -491,24 +636,40 @@ static void __flow_fini(int fd)
crypt_destroy_ctx(proc.flows[fd].crypt);
- list_del(&proc.flows[fd].next);
-
flow_clear(fd);
}
static void flow_fini(int fd)
{
+ flow_quiesce(fd);
+
pthread_rwlock_wrlock(&proc.lock);
- __flow_fini(fd);
+ do_flow_fini(fd);
pthread_rwlock_unlock(&proc.lock);
}
#define IS_ENCRYPTED(crypt) ((crypt)->nid != NID_undef)
-#define IS_ORDERED(flow) (flow.qs.in_order != 0)
+#define IS_ORDERED(info) ((info)->qs.service != SVC_RAW)
+#define IS_STREAM(info) ((info)->qs.service == SVC_STREAM)
+
+/* Raw MTU minus the wrapping (IV/Tag + optional CRC) dev.c adds. */
+static __inline__ size_t flow_user_mtu(const struct flow * flow,
+ size_t raw)
+{
+ size_t hdr;
+
+ hdr = flow->headsz + flow->tailsz;
+ if (flow->info.qs.ber == 0 && flow->crypt == NULL)
+ hdr += CRCLEN;
+
+ return raw > hdr ? raw - hdr : 0;
+}
+
static int flow_init(struct flow_info * info,
- struct crypt_sk * sk)
+ struct crypt_sk * sk,
+ time_t rtt_hint)
{
struct timespec now;
struct flow * flow;
@@ -550,7 +711,6 @@ static int flow_init(struct flow_info * info,
flow->tailsz = 0;
if (IS_ENCRYPTED(sk)) {
- /* Set to lower value in tests, should we make configurable? */
sk->rot_bit = KEY_ROTATION_BIT;
flow->crypt = crypt_create_ctx(sk);
if (flow->crypt == NULL)
@@ -561,22 +721,16 @@ static int flow_init(struct flow_info * info,
assert(flow->frcti == NULL);
- if (IS_ORDERED(flow->info)) {
- flow->frcti = frcti_create(fd, DELT_A, DELT_R, info->mpl);
+ if (IS_ORDERED(&flow->info)) {
+ uint32_t frct_mtu = flow_user_mtu(flow, info->mtu);
+
+ flow->frcti = frcti_create(fd, DELT_A, DELT_R,
+ info->mpl, rtt_hint,
+ info->qs, frct_mtu);
if (flow->frcti == NULL)
goto fail_frcti;
-
- if (ssm_flow_set_add(proc.fqset, 0, info->id))
- goto fail_flow_set_add;
-
- ++proc.n_frcti;
- if (proc.n_frcti == 1 &&
- pthread_create(&proc.tx, NULL, flow_tx, NULL) < 0)
- goto fail_tx_thread;
}
- list_add_tail(&flow->next, &proc.flow_list);
-
proc.id_to_fd[info->id].fd = fd;
flow_set_state(&proc.id_to_fd[info->id], FLOW_ALLOCATED);
@@ -585,10 +739,6 @@ static int flow_init(struct flow_info * info,
return fd;
- fail_tx_thread:
- ssm_flow_set_del(proc.fqset, 0, info->id);
- fail_flow_set_add:
- frcti_destroy(flow->frcti);
fail_frcti:
crypt_destroy_ctx(flow->crypt);
fail_crypt:
@@ -655,13 +805,13 @@ static void init(int argc,
gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0);
}
#endif
- proc.fds = bmp_create(PROG_MAX_FLOWS - PROG_RES_FDS, PROG_RES_FDS);
+ proc.fds = bmp_create(PROC_MAX_FLOWS - PROC_RES_FDS, PROC_RES_FDS);
if (proc.fds == NULL) {
fprintf(stderr, "FATAL: Could not create fd bitmap.\n");
goto fail_fds;
}
- proc.fqueues = bmp_create(PROG_MAX_FQUEUES, 0);
+ proc.fqueues = bmp_create(PROC_MAX_FQUEUES, 0);
if (proc.fqueues == NULL) {
fprintf(stderr, "FATAL: Could not create fqueue bitmap.\n");
goto fail_fqueues;
@@ -677,13 +827,13 @@ static void init(int argc,
goto fail_rdrb;
}
- proc.flows = malloc(sizeof(*proc.flows) * PROG_MAX_FLOWS);
+ proc.flows = malloc(sizeof(*proc.flows) * PROC_MAX_FLOWS);
if (proc.flows == NULL) {
fprintf(stderr, "FATAL: Could not malloc flows.\n");
goto fail_flows;
}
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
flow_clear(i);
proc.id_to_fd = malloc(sizeof(*proc.id_to_fd) * SYS_MAX_FLOWS);
@@ -716,20 +866,14 @@ static void init(int argc,
goto fail_fqset;
}
- proc.frct_set = fset_create();
- if (proc.frct_set == NULL || proc.frct_set->idx != 0) {
- fprintf(stderr, "FATAL: Could not create FRCT set.\n");
- goto fail_frct_set;
- }
-
- if (timerwheel_init() < 0) {
+ if (tw_init() < 0) {
fprintf(stderr, "FATAL: Could not initialize timerwheel.\n");
goto fail_timerwheel;
}
if (crypt_secure_malloc_init(PROC_SECMEM_MAX) < 0) {
fprintf(stderr, "FATAL: Could not init secure malloc.\n");
- goto fail_timerwheel;
+ goto fail_secmem;
}
#if defined PROC_FLOW_STATS
@@ -741,24 +885,15 @@ static void init(int argc,
}
}
#endif
- if (pthread_create(&proc.rx, NULL, flow_rx, NULL) < 0) {
- fprintf(stderr, "FATAL: Could not start monitor thread.\n");
- goto fail_monitor;
- }
-
- list_head_init(&proc.flow_list);
-
return;
- fail_monitor:
#if defined PROC_FLOW_STATS
- rib_fini();
fail_rib_init:
+ crypt_secure_malloc_fini();
#endif
- timerwheel_fini();
+ fail_secmem:
+ tw_fini();
fail_timerwheel:
- fset_destroy(proc.frct_set);
- fail_frct_set:
ssm_flow_set_close(proc.fqset);
fail_fqset:
pthread_rwlock_destroy(&proc.lock);
@@ -789,19 +924,20 @@ static void fini(void)
if (proc.fds == NULL)
return;
- pthread_cancel(proc.rx);
- pthread_join(proc.rx, NULL);
+ /* Wake all in-flight readers/writers BEFORE wrlock acquire. */
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
+ if (proc.flows[i].info.id != -1)
+ flow_quiesce(i);
pthread_rwlock_wrlock(&proc.lock);
- for (i = 0; i < PROG_MAX_FLOWS; ++i) {
+ for (i = 0; i < PROC_MAX_FLOWS; ++i) {
struct flow * flow = &proc.flows[i];
if (flow->info.id != -1) {
ssize_t idx;
- ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWDOWN);
while ((idx = ssm_rbuff_read(flow->rx_rb)) >= 0)
ssm_pool_remove(proc.pool, idx);
- __flow_fini(i);
+ do_flow_fini(i);
}
}
@@ -813,9 +949,9 @@ static void fini(void)
#ifdef PROC_FLOW_STATS
rib_fini();
#endif
- timerwheel_fini();
+ crypt_secure_malloc_fini();
- fset_destroy(proc.frct_set);
+ tw_fini();
ssm_flow_set_close(proc.fqset);
@@ -860,6 +996,10 @@ int flow_accept(qosspec_t * qs,
if (qs != NULL)
qs->ber = 1;
#endif
+ /* STREAM cannot tolerate loss: drops create silent gaps. */
+ if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0)
+ return -EINVAL;
+
memset(&flow, 0, sizeof(flow));
flow.n_pid = getpid();
@@ -878,7 +1018,8 @@ int flow_accept(qosspec_t * qs,
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ /* No RTT in accept; rtt_hint=0 bootstraps from first ACK. */
+ fd = flow_init(&flow, &crypt, 0);
crypt_secure_clear(key, SYMMKEYSZ);
@@ -899,11 +1040,16 @@ int flow_alloc(const char * dst,
uint8_t key[SYMMKEYSZ];
int fd;
int err;
+ struct timespec t0;
+ struct timespec t1;
#ifdef QOS_DISABLE_CRC
if (qs != NULL)
qs->ber = 1;
#endif
+ /* STREAM cannot tolerate loss: drops create silent gaps. */
+ if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0)
+ return -EINVAL;
memset(&flow, 0, sizeof(flow));
@@ -913,11 +1059,13 @@ int flow_alloc(const char * dst,
if (flow_alloc__irm_req_ser(&msg, &flow, dst, timeo))
return -ENOMEM;
+ clock_gettime(PTHREAD_COND_CLOCK, &t0);
+
err = send_recv_msg(&msg);
- if (err < 0) {
- printf("send_recv_msg error %d\n", err);
+ if (err < 0)
return err;
- }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &t1);
crypt.key = key;
@@ -925,7 +1073,7 @@ int flow_alloc(const char * dst,
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ fd = flow_init(&flow, &crypt, ts_diff_ns(&t1, &t0));
crypt_secure_clear(key, SYMMKEYSZ);
@@ -964,7 +1112,7 @@ int flow_join(const char * dst,
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ fd = flow_init(&flow, &crypt, 0);
crypt_secure_clear(key, SYMMKEYSZ);
@@ -983,10 +1131,10 @@ int flow_dealloc(int fd)
struct flow * flow;
int err;
- if (fd < 0 || fd >= SYS_MAX_FLOWS )
+ if (fd < 0 || fd >= PROC_MAX_FLOWS )
return -EINVAL;
- memset(&info, 0, sizeof(flow));
+ memset(&info, 0, sizeof(info));
flow = &proc.flows[fd];
@@ -1008,9 +1156,8 @@ int flow_dealloc(int fd)
pthread_rwlock_rdlock(&proc.lock);
- timeo.tv_sec = frcti_dealloc(flow->frcti);
- while (timeo.tv_sec < 0) { /* keep the flow active for rtx */
- ssize_t ret;
+ while (FRCTI_LINGERING(flow->frcti)) {
+ ssize_t ret;
pthread_rwlock_unlock(&proc.lock);
@@ -1018,12 +1165,12 @@ int flow_dealloc(int fd)
pthread_rwlock_rdlock(&proc.lock);
- timeo.tv_sec = frcti_dealloc(flow->frcti);
-
- if (ret == -EFLOWDOWN && timeo.tv_sec < 0)
- timeo.tv_sec = -timeo.tv_sec;
+ if (ret == -EFLOWDOWN)
+ break;
}
+ timeo.tv_sec = FRCTI_DEALLOC(flow->frcti);
+
pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
ssm_rbuff_fini(flow->tx_rb);
@@ -1033,15 +1180,18 @@ int flow_dealloc(int fd)
info.id = flow->info.id;
info.n_pid = getpid();
- if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0)
- return -ENOMEM;
+ if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0) {
+ err = -ENOMEM;
+ goto out;
+ }
err = send_recv_msg(&msg);
if (err < 0)
- return err;
+ goto out;
err = irm__irm_result_des(&msg);
+ out:
flow_fini(fd);
return err;
@@ -1055,12 +1205,12 @@ int ipcp_flow_dealloc(int fd)
struct flow * flow;
int err;
- if (fd < 0 || fd >= SYS_MAX_FLOWS )
+ if (fd < 0 || fd >= PROC_MAX_FLOWS )
return -EINVAL;
flow = &proc.flows[fd];
- memset(&info, 0, sizeof(flow));
+ memset(&info, 0, sizeof(info));
pthread_rwlock_rdlock(&proc.lock);
@@ -1074,15 +1224,18 @@ int ipcp_flow_dealloc(int fd)
pthread_rwlock_unlock(&proc.lock);
- if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0)
- return -ENOMEM;
+ if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0) {
+ err = -ENOMEM;
+ goto out;
+ }
err = send_recv_msg(&msg);
if (err < 0)
- return err;
+ goto out;
err = irm__irm_result_des(&msg);
+ out:
flow_fini(fd);
return err;
@@ -1102,8 +1255,18 @@ int fccntl(int fd,
uint32_t tx_acl;
size_t * qlen;
struct flow * flow;
-
- if (fd < 0 || fd >= SYS_MAX_FLOWS)
+ uint16_t old_acc;
+ uint16_t new_acc;
+ size_t max;
+ size_t * maxp;
+ size_t rsz;
+ size_t * rszp;
+ time_t rto;
+ time_t * rtop;
+ int rc;
+ bool emit_eos = false;
+
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
@@ -1167,14 +1330,27 @@ int fccntl(int fd,
qlen = va_arg(l, size_t *);
*qlen = ssm_rbuff_queued(flow->tx_rb);
break;
+ case FLOWGMTU:
+ maxp = va_arg(l, size_t *);
+ if (maxp == NULL)
+ goto einval;
+ *maxp = flow_user_mtu(flow, flow->info.mtu);
+ break;
case FLOWSFLAGS:
+ old_acc = flow->oflags & FLOWFACCMODE;
flow->oflags = va_arg(l, uint32_t);
+ new_acc = flow->oflags & FLOWFACCMODE;
+
+ /* Defer EOS emit until after proc.lock is dropped: */
+ /* frcti_fin_snd may block on shm-pool/tx-rb. */
+ if (new_acc == FLOWFRDONLY
+ && old_acc != FLOWFRDONLY
+ && flow->frcti != NULL)
+ emit_eos = true;
+
rx_acl = ssm_rbuff_get_acl(flow->rx_rb);
- tx_acl = ssm_rbuff_get_acl(flow->rx_rb);
- /*
- * Making our own flow write only means making the
- * the other side of the flow read only.
- */
+ tx_acl = ssm_rbuff_get_acl(flow->tx_rb);
+ /* Our flow write-only -> peer's read-only. */
if (flow->oflags & FLOWFWRONLY)
rx_acl |= ACL_RDONLY;
if (flow->oflags & FLOWFRDWR)
@@ -1218,6 +1394,59 @@ int fccntl(int fd,
goto eperm;
*cflags = frcti_getflags(flow->frcti);
break;
+ case FRCTSMAXSDU:
+ max = va_arg(l, size_t);
+ if (flow->frcti == NULL)
+ goto eperm;
+ if (frcti_set_max_rcv_sdu(flow->frcti, max) < 0)
+ goto einval;
+ break;
+ case FRCTGMAXSDU:
+ maxp = va_arg(l, size_t *);
+ if (maxp == NULL)
+ goto einval;
+ if (flow->frcti == NULL)
+ goto eperm;
+ *maxp = frcti_get_max_rcv_sdu(flow->frcti);
+ break;
+ case FRCTSRRINGSZ:
+ rsz = va_arg(l, size_t);
+ if (flow->frcti == NULL)
+ goto eperm;
+ rc = frcti_set_rcv_ring_sz(flow->frcti, rsz);
+ if (rc < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ va_end(l);
+ return rc;
+ }
+ break;
+ case FRCTGRRINGSZ:
+ rszp = va_arg(l, size_t *);
+ if (rszp == NULL)
+ goto einval;
+ if (flow->frcti == NULL)
+ goto eperm;
+ *rszp = frcti_get_rcv_ring_sz(flow->frcti);
+ break;
+ case FRCTSRTOMIN:
+ if (flow->frcti == NULL)
+ goto eperm;
+ rto = va_arg(l, time_t);
+ rc = frcti_set_rto_min(flow->frcti, rto);
+ if (rc < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ va_end(l);
+ return rc;
+ }
+ break;
+ case FRCTGRTOMIN:
+ if (flow->frcti == NULL)
+ goto eperm;
+ rtop = va_arg(l, time_t *);
+ if (rtop == NULL)
+ goto einval;
+ *rtop = frcti_get_rto_min(flow->frcti);
+ break;
default:
pthread_rwlock_unlock(&proc.lock);
va_end(l);
@@ -1227,6 +1456,9 @@ int fccntl(int fd,
pthread_rwlock_unlock(&proc.lock);
+ if (emit_eos)
+ frcti_fin_snd(flow->frcti);
+
va_end(l);
return 0;
@@ -1241,86 +1473,195 @@ int fccntl(int fd,
return -EPERM;
}
-static int chk_crc(struct ssm_pk_buff * spb)
-{
- uint32_t crc;
- uint8_t * head = ssm_pk_buff_head(spb);
- uint8_t * tail = ssm_pk_buff_tail_release(spb, CRCLEN);
-
- mem_hash(HASH_CRC32, &crc, head, tail - head);
-
- return !(crc == *((uint32_t *) tail));
-}
-
-static int add_crc(struct ssm_pk_buff * spb)
-{
- uint8_t * head;
- uint8_t * tail;
-
- tail = ssm_pk_buff_tail_alloc(spb, CRCLEN);
- if (tail == NULL)
- return -ENOMEM;
-
- head = ssm_pk_buff_head(spb);
- mem_hash(HASH_CRC32, tail, head, tail - head);
-
- return 0;
-}
-
static int flow_tx_spb(struct flow * flow,
struct ssm_pk_buff * spb,
+ uint16_t flags,
bool block,
struct timespec * abstime)
{
struct timespec now;
ssize_t idx;
+ size_t pci_total;
int ret;
clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_wrlock(&proc.lock);
-
flow->snd_act = now;
- pthread_rwlock_unlock(&proc.lock);
-
- idx = ssm_pk_buff_get_idx(spb);
-
- pthread_rwlock_rdlock(&proc.lock);
+ idx = ssm_pk_buff_get_off(spb);
if (ssm_pk_buff_len(spb) > 0) {
- if (frcti_snd(flow->frcti, spb) < 0)
+ if (FRCTI_SND(flow->frcti, spb, flags) < 0)
goto enomem;
- if (spb_encrypt(flow, spb) < 0)
- goto enomem;
+ if (flow->info.qs.ber == 0) {
+ pci_total = flow->frcti != NULL
+ ? frcti_data_hdr_len(flow->frcti) : 0;
+ if (crc_add(spb, pci_total) != 0)
+ goto enomem;
+ }
- if (flow->info.qs.ber == 0 && add_crc(spb) != 0)
+ if (spb_encrypt(flow, spb) < 0)
goto enomem;
}
- pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
-
if (!block)
ret = ssm_rbuff_write(flow->tx_rb, idx);
else
ret = ssm_rbuff_write_b(flow->tx_rb, idx, abstime);
- if (ret < 0)
+ if (ret < 0) {
ssm_pool_remove(proc.pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
-
- pthread_cleanup_pop(true);
+ return ret;
+ }
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
return 0;
-enomem:
- pthread_rwlock_unlock(&proc.lock);
+ enomem:
ssm_pool_remove(proc.pool, idx);
return -ENOMEM;
}
+/* Per-fragment role for fragment i out of n; n == 1 yields SOLE. */
+static __inline__ uint16_t flow_frag_role(size_t i, size_t n)
+{
+ if (n == 1)
+ return FRCT_FR_SOLE;
+ if (i == 0)
+ return FRCT_FR_FIRST;
+ if (i + 1 == n)
+ return FRCT_FR_LAST;
+
+ return FRCT_FR_MID;
+}
+
+/*
+ * Stream-mode write: split buf into chunks of
+ * (frag_mtu - PCI - PCI_STREAM) bytes; each chunk goes through the
+ * normal tx path. frcti_snd injects the [start,end) extension and
+ * advances snd_byte_next under its wrlock. No FFGM/LFGM role bits.
+ */
+static ssize_t flow_write_stream(struct flow * flow,
+ const void * buf,
+ size_t count,
+ int oflags,
+ struct timespec * dl)
+{
+ const uint8_t * src = buf;
+ size_t payload;
+ size_t off = 0;
+ bool block = !(oflags & FLOWFWNOBLOCK);
+
+ if (!FRCTI_IS_FRTX(flow->frcti))
+ return -EMSGSIZE;
+
+ payload = FRCTI_PAYLOAD_CAP(flow->frcti);
+
+ while (off < count) {
+ struct ssm_pk_buff * spb;
+ uint8_t * ptr;
+ ssize_t idx;
+ size_t clen;
+ int ret;
+
+ ret = flow_wait_window(flow, 1, block, dl);
+ if (ret < 0)
+ return off > 0 ? (ssize_t) off : (ssize_t) ret;
+
+ clen = MIN(count - off, payload);
+
+ if (block)
+ idx = ssm_pool_alloc_b(proc.pool, clen, &ptr,
+ &spb, dl);
+ else
+ idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb);
+ if (idx < 0)
+ return off > 0 ? (ssize_t) off : idx;
+
+ memcpy(ptr, src + off, clen);
+
+ ret = flow_tx_spb(flow, spb, 0, block, dl);
+ if (ret < 0)
+ return off > 0 ? (ssize_t) off : (ssize_t) ret;
+
+ off += clen;
+ }
+
+ return (ssize_t) count;
+}
+
+/* Per-fragment flow_tx_spb loop. Raw flows refuse; FRCT splits the SDU. */
+static ssize_t flow_write_frag(struct flow * flow,
+ const void * buf,
+ size_t count,
+ int oflags,
+ struct timespec * dl)
+{
+ const uint8_t * src = buf;
+ size_t frag_payload;
+ size_t n;
+ size_t off = 0;
+ size_t i;
+ int ret;
+ bool block = !(oflags & FLOWFWNOBLOCK);
+
+ /* Raw flows carry no PCI; cannot fragment. */
+ if (flow->frcti == NULL)
+ return -EMSGSIZE;
+
+ frag_payload = FRCTI_PAYLOAD_CAP(flow->frcti);
+
+ /* Guard the ceil-divide against size_t overflow. */
+ if (count > SIZE_MAX - frag_payload + 1)
+ return -EMSGSIZE;
+ n = (count + frag_payload - 1) / frag_payload;
+
+ /* SDU larger than the FC window can ever offer would deadlock. */
+ if (n > RQ_SIZE)
+ return -EMSGSIZE;
+
+ /* SDU-atomic FC: wait for n seqnos to avoid overshoot mid-SDU. */
+ ret = flow_wait_window(flow, n, block, dl);
+ if (ret < 0)
+ return (ssize_t) ret;
+
+ STAT_BUMP(flow->frcti, sdu_snd_frag);
+
+ for (i = 0; i < n; ++i) {
+ struct ssm_pk_buff * spb;
+ uint8_t * ptr;
+ ssize_t idx;
+ size_t clen;
+
+ clen = (i + 1 == n) ? (count - off) : frag_payload;
+
+ if (block)
+ idx = ssm_pool_alloc_b(proc.pool, clen, &ptr,
+ &spb, dl);
+ else
+ idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb);
+ if (idx < 0) {
+ if (off > 0)
+ STAT_BUMP(flow->frcti, sdu_snd_alloc);
+ return off > 0 ? (ssize_t) off : idx;
+ }
+
+ memcpy(ptr, src + off, clen);
+
+ ret = flow_tx_spb(flow, spb, flow_frag_role(i, n),
+ block, dl);
+ if (ret < 0) {
+ if (off > 0)
+ STAT_BUMP(flow->frcti, sdu_snd_tx);
+ return off > 0 ? (ssize_t) off : (ssize_t) ret;
+ }
+
+ off += clen;
+ }
+
+ return (ssize_t) count;
+}
+
ssize_t flow_write(int fd,
const void * buf,
size_t count)
@@ -1330,76 +1671,75 @@ ssize_t flow_write(int fd,
int ret;
int flags;
struct timespec abs;
- struct timespec * abstime = NULL;
+ struct timespec now;
+ struct timespec * dl = NULL;
struct ssm_pk_buff * spb;
uint8_t * ptr;
if (buf == NULL && count != 0)
return -EINVAL;
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
- clock_gettime(PTHREAD_COND_CLOCK, &abs);
-
- pthread_rwlock_wrlock(&proc.lock);
+ pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
return -ENOTALLOC;
}
+ flags = flow->oflags;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
if (flow->snd_timesout) {
- ts_add(&abs, &flow->snd_timeo, &abs);
- abstime = &abs;
+ ts_add(&now, &flow->snd_timeo, &abs);
+ dl = &abs;
}
- flags = flow->oflags;
-
pthread_rwlock_unlock(&proc.lock);
if ((flags & FLOWFACCMODE) == FLOWFRDONLY)
return -EPERM;
- if (flags & FLOWFWNOBLOCK) {
- if (!frcti_is_window_open(flow->frcti))
- return -EAGAIN;
- idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb);
- } else {
- ret = frcti_window_wait(flow->frcti, abstime);
+ tw_move_safe();
+
+ if (flow->frcti != NULL) {
+ /* Pump rx_rb so a pure-writer processes ACKs. */
+ ret = flow_wait_window(flow, 1, !(flags & FLOWFWNOBLOCK), dl);
if (ret < 0)
return ret;
- idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, abstime);
+
+ if (count > 0 && FRCTI_IS_STREAM(flow->frcti))
+ return flow_write_stream(flow, buf, count, flags, dl);
+
+ if (FRCTI_NEEDS_FRAG(flow->frcti, count))
+ return flow_write_frag(flow, buf, count, flags, dl);
+ } else if (flow->info.mtu > 0
+ && count > flow_user_mtu(flow, flow->info.mtu)) {
+ /* Raw flows carry no PCI; refuse anything > one n-1 frame. */
+ return -EMSGSIZE;
}
+ if (flags & FLOWFWNOBLOCK)
+ idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb);
+ else
+ idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, dl);
if (idx < 0)
return idx;
if (count > 0)
memcpy(ptr, buf, count);
- ret = flow_tx_spb(flow, spb, !(flags & FLOWFWNOBLOCK), abstime);
+ ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE,
+ !(flags & FLOWFWNOBLOCK), dl);
return ret < 0 ? (ssize_t) ret : (ssize_t) count;
}
-static bool invalid_pkt(struct flow * flow,
- struct ssm_pk_buff * spb)
-{
- if (spb == NULL || ssm_pk_buff_len(spb) == 0)
- return true;
-
- if (flow->info.qs.ber == 0 && chk_crc(spb) != 0)
- return true;
-
- if (spb_decrypt(flow, spb) < 0)
- return true;
-
- return false;
-}
-
static ssize_t flow_rx_spb(struct flow * flow,
struct ssm_pk_buff ** spb,
bool block,
@@ -1408,19 +1748,14 @@ static ssize_t flow_rx_spb(struct flow * flow,
ssize_t idx;
struct timespec now;
- idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime) :
- ssm_rbuff_read(flow->rx_rb);
+ idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime)
+ : ssm_rbuff_read(flow->rx_rb);
if (idx < 0)
return idx;
clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_wrlock(&proc.lock);
-
flow->rcv_act = now;
- pthread_rwlock_unlock(&proc.lock);
-
*spb = ssm_pool_get(proc.pool, idx);
if (invalid_pkt(flow, *spb)) {
@@ -1431,28 +1766,124 @@ static ssize_t flow_rx_spb(struct flow * flow,
return idx;
}
+static ssize_t raw_flow_read_pkt(struct flow * flow,
+ bool block,
+ struct timespec * dl)
+{
+ struct ssm_pk_buff * spb;
+ struct timespec wait_abs;
+ ssize_t idx;
+
+ while (true) {
+ if (!block) {
+ idx = ssm_rbuff_read(flow->rx_rb);
+ if (idx < 0)
+ return -EAGAIN;
+ } else {
+ compute_wait_deadline(dl, &wait_abs);
+ idx = ssm_rbuff_read_b(flow->rx_rb, &wait_abs);
+ if (idx == -ETIMEDOUT) {
+ if (deadline_passed(dl))
+ return -ETIMEDOUT;
+ continue;
+ }
+ if (idx < 0)
+ return idx;
+ }
+
+ spb = ssm_pool_get(proc.pool, idx);
+ if (!invalid_pkt(flow, spb))
+ return idx;
+
+ ssm_pool_remove(proc.pool, idx);
+ if (!block)
+ return -EAGAIN;
+ }
+}
+
+static ssize_t deliver_pkt(struct flow * flow,
+ struct ssm_pk_buff * spb,
+ ssize_t idx,
+ void * buf,
+ size_t count,
+ bool partrd)
+{
+ uint8_t * packet = ssm_pk_buff_head(spb);
+ ssize_t n = ssm_pk_buff_len(spb);
+
+ assert(n >= 0);
+
+ if (n <= (ssize_t) count) {
+ memcpy(buf, packet, n);
+ ipcp_spb_release(spb);
+ if (partrd && n == (ssize_t) count)
+ flow->part_idx = DONE_PART;
+ else
+ flow->part_idx = NO_PART;
+
+ return n;
+ }
+
+ if (partrd) {
+ memcpy(buf, packet, count);
+ ssm_pk_buff_pop(spb, n);
+ flow->part_idx = idx;
+ return count;
+ }
+
+ ipcp_spb_release(spb);
+ return -EMSGSIZE;
+}
+
+/* Drive frcti_consume until it delivers or errors. */
+static ssize_t flow_read_frcti(struct flow * flow,
+ void * buf,
+ size_t count,
+ bool block,
+ struct timespec * dl)
+{
+ struct timespec now;
+ ssize_t bytes;
+ int rc;
+
+ while (true) {
+ flow_drain_rx_nb(flow);
+ bytes = FRCTI_CONSUME(flow->frcti, buf, count);
+ if (bytes >= 0)
+ break;
+ if (bytes != -EAGAIN)
+ return bytes;
+ if (!block)
+ return -EAGAIN;
+ rc = flow_rx_one(flow, dl);
+ if (rc < 0)
+ return rc;
+ }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ flow->rcv_act = now;
+
+ return bytes;
+}
+
ssize_t flow_read(int fd,
void * buf,
size_t count)
{
- ssize_t idx;
- ssize_t n;
- uint8_t * packet;
+ struct flow * flow;
struct ssm_pk_buff * spb;
struct timespec abs;
struct timespec now;
- struct timespec * abstime = NULL;
- struct flow * flow;
+ struct timespec * dl = NULL;
+ ssize_t idx;
bool block;
bool partrd;
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
@@ -1461,8 +1892,8 @@ ssize_t flow_read(int fd,
}
if (flow->part_idx == DONE_PART) {
- pthread_rwlock_unlock(&proc.lock);
flow->part_idx = NO_PART;
+ pthread_rwlock_unlock(&proc.lock);
return 0;
}
@@ -1470,75 +1901,33 @@ ssize_t flow_read(int fd,
partrd = !(flow->oflags & FLOWFRNOPART);
if (flow->rcv_timesout) {
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
ts_add(&now, &flow->rcv_timeo, &abs);
- abstime = &abs;
+ dl = &abs;
}
- idx = flow->part_idx;
- if (idx < 0) {
- while ((idx = frcti_queued_pdu(flow->frcti)) < 0) {
- pthread_rwlock_unlock(&proc.lock);
-
- idx = flow_rx_spb(flow, &spb, block, abstime);
- if (idx < 0) {
- if (block && idx != -EAGAIN)
- return idx;
- if (!block)
- return idx;
+ pthread_rwlock_unlock(&proc.lock);
- pthread_rwlock_rdlock(&proc.lock);
- continue;
- }
+ tw_move_safe();
- pthread_rwlock_rdlock(&proc.lock);
+ idx = flow->part_idx;
+ if (idx < 0 && flow->frcti != NULL)
+ return flow_read_frcti(flow, buf, count, block, dl);
- frcti_rcv(flow->frcti, spb);
- }
+ if (idx < 0) {
+ idx = raw_flow_read_pkt(flow, block, dl);
+ if (idx < 0)
+ return idx;
}
spb = ssm_pool_get(proc.pool, idx);
- pthread_rwlock_unlock(&proc.lock);
-
- packet = ssm_pk_buff_head(spb);
-
- n = ssm_pk_buff_len(spb);
-
- assert(n >= 0);
-
- if (n <= (ssize_t) count) {
- memcpy(buf, packet, n);
- ipcp_spb_release(spb);
-
- pthread_rwlock_wrlock(&proc.lock);
-
- flow->part_idx = (partrd && n == (ssize_t) count) ?
- DONE_PART : NO_PART;
-
- flow->rcv_act = now;
-
- pthread_rwlock_unlock(&proc.lock);
- return n;
- } else {
- if (partrd) {
- memcpy(buf, packet, count);
- ssm_pk_buff_head_release(spb, n);
- pthread_rwlock_wrlock(&proc.lock);
- flow->part_idx = idx;
-
- flow->rcv_act = now;
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ flow->rcv_act = now;
- pthread_rwlock_unlock(&proc.lock);
- return count;
- } else {
- ipcp_spb_release(spb);
- return -EMSGSIZE;
- }
- }
+ return deliver_pkt(flow, spb, idx, buf, count, partrd);
}
-/* fqueue functions. */
-
struct flow_set * fset_create(void)
{
struct flow_set * set;
@@ -1614,7 +2003,7 @@ int fset_add(struct flow_set * set,
struct flow * flow;
int ret;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return -EINVAL;
flow = &proc.flows[fd];
@@ -1650,7 +2039,7 @@ void fset_del(struct flow_set * set,
{
struct flow * flow;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return;
flow = &proc.flows[fd];
@@ -1661,7 +2050,7 @@ void fset_del(struct flow_set * set,
ssm_flow_set_del(proc.fqset, set->idx, flow->info.id);
if (flow->frcti != NULL)
- ssm_flow_set_add(proc.fqset, 0, proc.flows[fd].info.id);
+ ssm_flow_set_add(proc.fqset, 0, flow->info.id);
pthread_rwlock_unlock(&proc.lock);
}
@@ -1672,7 +2061,7 @@ bool fset_has(const struct flow_set * set,
struct flow * flow;
bool ret;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return false;
flow = &proc.flows[fd];
@@ -1691,61 +2080,59 @@ bool fset_has(const struct flow_set * set,
return ret;
}
-/* Filter fqueue events for non-data packets */
static int fqueue_filter(struct fqueue * fq)
{
struct ssm_pk_buff * spb;
int fd;
ssize_t idx;
struct frcti * frcti;
+ int ret = 0;
- while (fq->next < fq->fqsize) {
- if (fq->fqueue[fq->next].event != FLOW_PKT)
- return 1;
+ /* proc.lock rdlock gates frcti_destroy via flow_fini wrlock. */
+ pthread_rwlock_rdlock(&proc.lock);
- pthread_rwlock_rdlock(&proc.lock);
+ while (fq->next < fq->fqsize) {
+ if (fq->fqueue[fq->next].event != FLOW_PKT) {
+ ret = 1;
+ goto out;
+ }
fd = proc.id_to_fd[fq->fqueue[fq->next].flow_id].fd;
if (fd < 0) {
++fq->next;
- pthread_rwlock_unlock(&proc.lock);
continue;
}
frcti = proc.flows[fd].frcti;
if (frcti == NULL) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ ret = 1;
+ goto out;
}
- if (__frcti_pdu_ready(frcti) >= 0) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ if (FRCTI_PDU_READY(frcti)) {
+ ret = 1;
+ goto out;
}
- pthread_rwlock_unlock(&proc.lock);
-
idx = flow_rx_spb(&proc.flows[fd], &spb, false, NULL);
if (idx < 0)
- return 0;
-
- pthread_rwlock_rdlock(&proc.lock);
+ goto out;
spb = ssm_pool_get(proc.pool, idx);
- __frcti_rcv(frcti, spb);
+ FRCTI_RCV(frcti, spb);
- if (__frcti_pdu_ready(frcti) >= 0) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ if (FRCTI_PDU_READY(frcti)) {
+ ret = 1;
+ goto out;
}
- pthread_rwlock_unlock(&proc.lock);
-
++fq->next;
}
- return 0;
+ out:
+ pthread_rwlock_unlock(&proc.lock);
+ return ret;
}
int fqueue_next(struct fqueue * fq)
@@ -1792,7 +2179,8 @@ ssize_t fevent(struct flow_set * set,
{
ssize_t ret = 0;
struct timespec abs;
- struct timespec * t = NULL;
+ struct timespec * dl = NULL;
+ struct timespec wait_abs;
if (set == NULL || fq == NULL)
return -EINVAL;
@@ -1800,17 +2188,26 @@ ssize_t fevent(struct flow_set * set,
if (fq->fqsize > 0 && fq->next != fq->fqsize)
return 1;
- clock_gettime(PTHREAD_COND_CLOCK, &abs);
-
if (timeo != NULL) {
- ts_add(&abs, timeo, &abs);
- t = &abs;
+ struct timespec now;
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, timeo, &abs);
+ dl = &abs;
}
while (ret == 0) {
- ret = ssm_flow_set_wait(proc.fqset, set->idx, fq->fqueue, t);
- if (ret == -ETIMEDOUT)
- return -ETIMEDOUT;
+ tw_move_safe();
+
+ compute_wait_deadline(dl, &wait_abs);
+
+ ret = ssm_flow_set_wait(proc.fqset, set->idx,
+ fq->fqueue, &wait_abs);
+ if (ret == -ETIMEDOUT) {
+ if (deadline_passed(dl))
+ return -ETIMEDOUT;
+ ret = 0;
+ continue;
+ }
fq->fqsize = ret;
fq->next = 0;
@@ -1823,8 +2220,6 @@ ssize_t fevent(struct flow_set * set,
return 1;
}
-/* ipcp-dev functions. */
-
int np1_flow_alloc(pid_t n_pid,
int flow_id)
{
@@ -1837,9 +2232,10 @@ int np1_flow_alloc(pid_t n_pid,
flow.n_pid = getpid();
flow.qs = qos_np1;
flow.mpl = 0;
- flow.n_1_pid = n_pid; /* This "flow" is upside-down! */
+ /* np1 flow: n_1_pid is the upper. */
+ flow.n_1_pid = n_pid;
- return flow_init(&flow, &crypt);
+ return flow_init(&flow, &crypt, 0);
}
int np1_flow_dealloc(int flow_id,
@@ -1847,12 +2243,7 @@ int np1_flow_dealloc(int flow_id,
{
int fd;
- /*
- * TODO: Don't pass timeo to the IPCP but wait in IRMd.
- * This will need async ops, waiting until we bootstrap
- * the IRMd over ouroboros.
- */
-
+ /* TODO: wait in IRMd, not here; needs async ops. */
sleep(timeo);
pthread_rwlock_rdlock(&proc.lock);
@@ -1900,6 +2291,7 @@ int ipcp_create_r(const struct ipcp_info * info)
int ipcp_flow_req_arr(const buffer_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data)
{
struct flow_info flow;
@@ -1916,6 +2308,7 @@ int ipcp_flow_req_arr(const buffer_t * dst,
flow.n_1_pid = getpid();
flow.qs = qs;
flow.mpl = mpl;
+ flow.mtu = mtu;
if (ipcp_flow_req_arr__irm_req_ser(&msg, dst, &flow, data) < 0)
return -ENOMEM;
@@ -1930,22 +2323,25 @@ int ipcp_flow_req_arr(const buffer_t * dst,
if (err < 0)
return err;
- assert(crypt.nid == NID_undef); /* np1 flows are not encrypted */
+ /* np1 flows are not encrypted. */
+ assert(crypt.nid == NID_undef);
- /* inverted for np1_flow */
+ /* Inverted for np1_flow. */
flow.n_1_pid = flow.n_pid;
flow.n_pid = getpid();
flow.mpl = 0;
+ flow.mtu = 0;
flow.qs = qos_np1;
crypt.nid = NID_undef;
- return flow_init(&flow, &crypt);
+ return flow_init(&flow, &crypt, 0);
}
int ipcp_flow_alloc_reply(int fd,
int response,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data)
{
struct flow_info flow;
@@ -1953,7 +2349,7 @@ int ipcp_flow_alloc_reply(int fd,
buffer_t msg = {SOCK_BUF_SIZE, buf};
int err;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
pthread_rwlock_rdlock(&proc.lock);
@@ -1962,6 +2358,7 @@ int ipcp_flow_alloc_reply(int fd,
pthread_rwlock_unlock(&proc.lock);
flow.mpl = mpl;
+ flow.mtu = mtu;
if (ipcp_flow_alloc_reply__irm_msg_ser(&msg, &flow, response, data) < 0)
return -ENOMEM;
@@ -1979,7 +2376,7 @@ int ipcp_flow_read(int fd,
struct flow * flow;
ssize_t idx = -1;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -1988,7 +2385,14 @@ int ipcp_flow_read(int fd,
assert(flow->info.id >= 0);
- while (frcti_queued_pdu(flow->frcti) < 0) {
+ /* Raw flow: deliver the popped pkt directly (no FRCT rq). */
+ if (flow->frcti == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ idx = flow_rx_spb(flow, spb, false, NULL);
+ return idx < 0 ? (int) idx : 0;
+ }
+
+ while (!FRCTI_PDU_READY(flow->frcti)) {
pthread_rwlock_unlock(&proc.lock);
idx = flow_rx_spb(flow, spb, false, NULL);
@@ -1997,7 +2401,7 @@ int ipcp_flow_read(int fd,
pthread_rwlock_rdlock(&proc.lock);
- frcti_rcv(flow->frcti, *spb);
+ FRCTI_RCV(flow->frcti, *spb);
}
pthread_rwlock_unlock(&proc.lock);
@@ -2011,12 +2415,12 @@ int ipcp_flow_write(int fd,
struct flow * flow;
int ret;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
- pthread_rwlock_wrlock(&proc.lock);
+ pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
@@ -2030,30 +2434,28 @@ int ipcp_flow_write(int fd,
pthread_rwlock_unlock(&proc.lock);
- ret = flow_tx_spb(flow, spb, true, NULL);
+ ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE, true, NULL);
return ret;
}
-static int pool_copy_spb(struct ssm_pool * src_pool,
- ssize_t src_idx,
- struct ssm_pool * dst_pool,
- struct ssm_pk_buff ** dst_spb)
+/* Copy src into dst_pool without consuming src. Caller owns both halves. */
+static int pool_dup_spb(struct ssm_pool * src_pool,
+ size_t src_off,
+ struct ssm_pool * dst_pool,
+ struct ssm_pk_buff ** dst_spb)
{
struct ssm_pk_buff * src;
uint8_t * ptr;
size_t len;
- src = ssm_pool_get(src_pool, src_idx);
+ src = ssm_pool_get(src_pool, src_off);
len = ssm_pk_buff_len(src);
- if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0) {
- ssm_pool_remove(src_pool, src_idx);
+ if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0)
return -ENOMEM;
- }
memcpy(ptr, ssm_pk_buff_head(src), len);
- ssm_pool_remove(src_pool, src_idx);
return 0;
}
@@ -2063,9 +2465,9 @@ int np1_flow_read(int fd,
struct ssm_pool * pool)
{
struct flow * flow;
- ssize_t idx = -1;
+ ssize_t off;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -2074,20 +2476,23 @@ int np1_flow_read(int fd,
pthread_rwlock_rdlock(&proc.lock);
- idx = ssm_rbuff_read(flow->rx_rb);
- if (idx < 0) {
+ off = ssm_rbuff_read(flow->rx_rb);
+ if (off < 0) {
pthread_rwlock_unlock(&proc.lock);
- return idx;
+ return off;
}
pthread_rwlock_unlock(&proc.lock);
if (pool == NULL) {
- *spb = ssm_pool_get(proc.pool, idx);
+ *spb = ssm_pool_get(proc.pool, off);
} else {
/* Cross-pool copy: PUP -> GSPP */
- if (pool_copy_spb(pool, idx, proc.pool, spb) < 0)
+ if (pool_dup_spb(pool, off, proc.pool, spb) < 0) {
+ ssm_pool_remove(pool, off);
return -ENOMEM;
+ }
+ ssm_pool_remove(pool, off);
}
return 0;
@@ -2100,9 +2505,10 @@ int np1_flow_write(int fd,
struct flow * flow;
struct ssm_pk_buff * dst;
int ret;
- ssize_t idx;
+ size_t off;
+ size_t dst_off;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -2121,45 +2527,47 @@ int np1_flow_write(int fd,
pthread_rwlock_unlock(&proc.lock);
- idx = ssm_pk_buff_get_idx(spb);
+ off = ssm_pk_buff_get_off(spb);
if (pool == NULL) {
- ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL);
+ ret = ssm_rbuff_write_b(flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(proc.pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ return ret;
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
} else {
- /* Cross-pool copy: GSPP -> PUP */
- if (pool_copy_spb(proc.pool, idx, pool, &dst) < 0)
+ /* Cross-pool copy: GSPP -> PUP. Src kept on error. */
+ if (pool_dup_spb(proc.pool, off, pool, &dst) < 0)
return -ENOMEM;
- idx = ssm_pk_buff_get_idx(dst);
- ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL);
- if (ret < 0)
- ssm_pool_remove(pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ dst_off = ssm_pk_buff_get_off(dst);
+ ret = ssm_rbuff_write_b(flow->tx_rb, dst_off, NULL);
+ if (ret < 0) {
+ ssm_pool_remove(pool, dst_off);
+ return ret;
+ }
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ ssm_pool_remove(proc.pool, off);
}
- return ret;
+ return 0;
}
int ipcp_spb_reserve(struct ssm_pk_buff ** spb,
size_t len)
{
- return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0 ? -1 : 0;
+ return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0
+ ? -1 : 0;
}
void ipcp_spb_release(struct ssm_pk_buff * spb)
{
- ssm_pool_remove(proc.pool, ssm_pk_buff_get_idx(spb));
+ ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb));
}
int ipcp_flow_fini(int fd)
{
struct ssm_rbuff * rx_rb;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
pthread_rwlock_rdlock(&proc.lock);
@@ -2188,7 +2596,7 @@ int ipcp_flow_fini(int fd)
int ipcp_flow_get_qoscube(int fd,
qoscube_t * cube)
{
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(cube);
pthread_rwlock_rdlock(&proc.lock);
@@ -2227,7 +2635,7 @@ int local_flow_transfer(int src_fd,
struct ssm_pk_buff * dst_spb;
struct ssm_pool * sp;
struct ssm_pool * dp;
- ssize_t idx;
+ ssize_t off;
int ret;
assert(src_fd >= 0);
@@ -2241,15 +2649,15 @@ int local_flow_transfer(int src_fd,
pthread_rwlock_rdlock(&proc.lock);
- idx = ssm_rbuff_read(src_flow->rx_rb);
- if (idx < 0) {
+ off = ssm_rbuff_read(src_flow->rx_rb);
+ if (off < 0) {
pthread_rwlock_unlock(&proc.lock);
- return idx;
+ return off;
}
if (dst_flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
- ssm_pool_remove(sp, idx);
+ ssm_pool_remove(sp, off);
return -ENOTALLOC;
}
@@ -2257,21 +2665,24 @@ int local_flow_transfer(int src_fd,
if (sp == dp) {
/* Same pool: zero-copy */
- ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL);
+ ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(sp, idx);
+ ssm_pool_remove(sp, off);
else
ssm_flow_set_notify(dst_flow->set,
dst_flow->info.id, FLOW_PKT);
} else {
/* Different pools: single copy */
- if (pool_copy_spb(sp, idx, dp, &dst_spb) < 0)
+ if (pool_dup_spb(sp, off, dp, &dst_spb) < 0) {
+ ssm_pool_remove(sp, off);
return -ENOMEM;
+ }
- idx = ssm_pk_buff_get_idx(dst_spb);
- ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL);
+ ssm_pool_remove(sp, off);
+ off = ssm_pk_buff_get_off(dst_spb);
+ ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(dp, idx);
+ ssm_pool_remove(dp, off);
else
ssm_flow_set_notify(dst_flow->set,
dst_flow->info.id, FLOW_PKT);
diff --git a/src/lib/frct.c b/src/lib/frct.c
index fad2cf69..2e8955e3 100644
--- a/src/lib/frct.c
+++ b/src/lib/frct.c
@@ -1,7 +1,7 @@
/*
* Ouroboros - Copyright (C) 2016 - 2026
*
- * Flow and Retransmission Control
+ * Flow and Retransmission Control Task (FRCT)
*
* Dimitri Staessens <dimitri@ouroboros.rocks>
* Sander Vrijders <sander@ouroboros.rocks>
@@ -20,97 +20,416 @@
* Foundation, Inc., http://www.fsf.org/about/contact/.
*/
-#include <ouroboros/endian.h>
+/* Included by dev.c; uses dev.c statics (proc, spb_encrypt, ...). */
#define DELT_RDV (100 * MILLION) /* ns */
-#define MAX_RDV (1 * BILLION) /* ns */
+#define MAX_RDV (1 * BILLION) /* ns */
+
+#define MAX_RTO_MUL 8 /* caps the RTO backoff shift */
+#define MAX_TLP_PER_EP 2 /* RFC 8985 §7.3: up to 2 TLPs */
+#define INITIAL_RTO (1 * BILLION) /* RFC 6298 §2.1: 1 s default */
+#define RTT_BOOT_NS (10 * MILLION) /* rtt_hint floor + initial mdev */
+#define SRTT_FLOOR_NS 1000L /* 1 us; smoothed RTT floor */
+#define MDEV_FLOOR_NS 100L /* 100 ns; mdev sanity floor */
+#define RTT_CLAMP_MUL 16 /* probe sample cap = N * srtt */
+#define MIN_RTT_WIN_NS (300ULL * BILLION) /* 5 min, Linux tcp default */
+#define NACK_COOLDOWN_NS (100 * MILLION) /* pre-DRF NACK cooldown */
+#define FRCT_TX_TIMEO_NS (250 * 1000) /* tx ring write deadline */
+#define ACK_DELAY_NS (2ULL * TICTIME) /* delayed-ACK fire delay */
#define FRCT "frct"
#define FRCT_PCILEN (sizeof(struct frct_pci))
#define FRCT_NAME_STRLEN 32
-struct frct_cr {
- uint32_t lwe; /* Left window edge */
- uint32_t rwe; /* Right window edge */
+/* Wire-protocol cap on SACK blocks per packet; binds both peers. */
+#define SACK_MAX_BLOCKS 2048
+#define SACK_BLOCK_SIZE (2 * sizeof(uint32_t))
+/* 2B count + 2B pad to 4-byte align the block list. */
+#define SACK_HDR_SIZE (sizeof(uint32_t))
+#define SACK_MIN_GAP_NS (250u * 1000u) /* 250 us SACK gap */
+#define MIN_REORDER_NS (250u * 1000u) /* 250 us RACK floor */
+#define SACK_RXM_MAX 32 /* Cap on retransmits staged from single SACK.*/
+#define DUP_THRESH 3 /* RFC 8985 §6.2 step 2.2 SACK count gate. */
+
+/* RFC 8985 §7.2 RACK reorder-window scaling cap. */
+#define REO_WND_MULT_MAX 20
+/* RFC 8985 §7.2 step 5: round trips of no DSACK before halving. */
+#define REO_DECAY_PKTS 16
+/* DSACK seqno sanity: reject reports older/farther than one rcv window. */
+#define MAX_DSACK_LAG RQ_SIZE
+
+/* Signed ns elapsed; negative under concurrent update (no underflow). */
+static __inline__ int64_t ts_age_ns(uint64_t now_ns,
+ uint64_t then_ns)
+{
+ return (int64_t)(now_ns - then_ns);
+}
- uint8_t cflags;
- uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */
+/* True iff strictly more than thr_ns elapsed since then_ns. */
+static __inline__ bool ts_aged_ns(uint64_t now_ns,
+ uint64_t then_ns,
+ uint64_t thr_ns)
+{
+ return ts_age_ns(now_ns, then_ns) > (int64_t) thr_ns;
+}
- struct timespec act; /* Last seen activity */
- time_t inact; /* Inactivity (s) */
-};
+/* FRCT r-timer: do not retransmit packet older than t_r (from first send). */
+#define RXM_AGED_OUT(t0, now_ns, t_r) \
+ ts_aged_ns((now_ns), (t0), (uint64_t)(t_r))
-struct frcti {
- int fd;
+/* FRCT a-timer: do not (re)transmit ACK after t_a from last data receive. */
+#define ACK_AGED_OUT(act, now_ns, t_a) \
+ ts_aged_ns((now_ns), (act), (uint64_t)(t_a))
- time_t mpl;
- time_t a;
- time_t r;
- time_t rdv;
-
- time_t srtt; /* Smoothed rtt */
- time_t mdev; /* Deviation */
- time_t rto; /* Retransmission timeout */
- uint32_t rttseq;
- struct timespec t_probe; /* Probe time */
- bool probe; /* Probe active */
-#ifdef PROC_FLOW_STATS
- size_t n_rtx; /* Number of rxm packets */
- size_t n_prb; /* Number of rtt probes */
- size_t n_rtt; /* Number of estimates */
- size_t n_dup; /* Duplicates received */
- size_t n_dak; /* Delayed ACKs received */
- size_t n_rdv; /* Number of rdv packets */
- size_t n_out; /* Packets out of window */
- size_t n_rqo; /* Packets out of rqueue */
-#endif
- struct frct_cr snd_cr;
- struct frct_cr rcv_cr;
+struct sack_args {
+ uint16_t n;
+ bool dsack; /* RFC 2883: block[0] is a DSACK report */
+ uint32_t ack;
+ uint32_t rwe;
+ uint32_t blocks[][2]; /* flexible — sized at alloc time */
+};
+/* NewReno-careful (RFC 6582) exit pad; gates RTT samples post-signal. */
+#define RTT_QUARANTINE 32
+#define RTTP_NONCE_LEN 16
- ssize_t rq[RQ_SIZE];
- pthread_rwlock_t lock;
+/* RTT-probe wire payload (after the FRCT PCI). */
+struct frct_rttp {
+ uint32_t probe_id; /* sender counter; 0 on reply */
+ uint32_t echo_id; /* peer's probe_id; 0 outbound */
+ uint8_t nonce[RTTP_NONCE_LEN]; /* random; echoed verbatim */
+} __attribute__((packed));
- bool open; /* Window open/closed */
- struct timespec t_wnd; /* Window closed time */
- struct timespec t_rdvs; /* Last rendez-vous sent */
- pthread_cond_t cond;
- pthread_mutex_t mtx;
-};
+#define RTTP_PAYLOAD sizeof(struct frct_rttp)
+#define RTTP_POS(id) ((id) & (RTTP_RING - 1))
+/*
+ * Flag values are assigned MSB-first on the wire (RFC convention):
+ * bit 0 = 0x8000 occupies wire-position 0 of the 16-bit flags
+ * field, bit 12 = 0x0008 is the last assigned bit, and the three
+ * LSBs (0x0007) are reserved.
+ */
enum frct_flags {
- FRCT_DATA = 0x01, /* PDU carries data */
- FRCT_DRF = 0x02, /* Data run flag */
- FRCT_ACK = 0x04, /* ACK field valid */
- FRCT_FC = 0x08, /* FC window valid */
- FRCT_RDVS = 0x10, /* Rendez-vous */
- FRCT_FFGM = 0x20, /* First Fragment */
- FRCT_MFGM = 0x40, /* More fragments */
+ FRCT_DATA = 0x8000, /* PDU carries data */
+ FRCT_DRF = 0x4000, /* Data run flag */
+ FRCT_ACK = 0x2000, /* ACK field valid */
+ FRCT_NACK = 0x1000, /* Neg-ACK: pci->seqno is arrival_seqno - 1 */
+ FRCT_FC = 0x0800, /* FC window valid */
+ FRCT_RDVS = 0x0400, /* Rendez-vous */
+ FRCT_FFGM = 0x0200, /* First fragment (begin) */
+ FRCT_LFGM = 0x0100, /* Last fragment (end) */
+ FRCT_RXM = 0x0080, /* Retransmission */
+ FRCT_SACK = 0x0040, /* SACK block list follows */
+ FRCT_RTTP = 0x0020, /* RTT probe / echo */
+ FRCT_KA = 0x0010, /* Keepalive */
+ FRCT_FIN = 0x0008, /* End of stream */
};
-struct frct_pci {
- uint8_t flags;
+/*
+ * DATA-packet fragment role (FFGM = begin, LFGM = end), SCTP-style:
+ * 1 1 = sole / un-fragmented SDU (begin AND end)
+ * 1 0 = first fragment of a multi-fragment SDU
+ * 0 0 = middle fragment
+ * 0 1 = last fragment
+ */
+#define FRCT_FR_MASK (FRCT_FFGM | FRCT_LFGM)
+#define FRCT_FR_SOLE (FRCT_FFGM | FRCT_LFGM)
+#define FRCT_FR_FIRST (FRCT_FFGM)
+#define FRCT_FR_MID (0)
+#define FRCT_FR_LAST (FRCT_LFGM)
+
+/* Default cap on a single reassembled SDU. App can raise via FRCTSMAXSDU */
+#define FRCT_MAX_SDU (1U << 20)
+
+/* Stream-mode PCI extension: [start, end) byte range on every DATA pkt. */
+struct frct_pci_stream {
+ uint32_t start;
+ uint32_t end;
+} __attribute__((packed));
+
+#define FRCT_PCI_STREAM_LEN (sizeof(struct frct_pci_stream))
- uint8_t pad; /* 24 bit window! */
- uint16_t window;
+/* Bytes following PCI: SACK list / RTTP nonce / control payload. */
+#define FRCT_BODY(pci) ((uint8_t *) (pci) + FRCT_PCILEN)
+/* Typed access to the stream PCI extension on stream DATA packets. */
+#define FRCT_SPCI(pci) \
+ ((struct frct_pci_stream *) ((uint8_t *) (pci) + FRCT_PCILEN))
+/* Push the FRCT header onto spb's head. */
+#define FRCT_HDR_PUSH(spb, frcti) \
+ ((struct frct_pci *) ssm_pk_buff_push((spb), \
+ frcti_data_hdr_len(frcti)))
+
+/* Pop a fixed-size header off spb's head; cast to type *. */
+#define FRCT_HDR_POP(spb, type) \
+ ((struct type *) ssm_pk_buff_pop((spb), sizeof(struct type)))
+
+/* Default / max per-flow stream rx ring (pow2); min N * per_pkt. */
+#define FRCT_STREAM_RING_MIN_PKTS 4
+#define FRCT_STREAM_RING_SZ (1U << 20) /* 1 MiB default */
+#define FRCT_STREAM_RING_SZ_MAX (1U << 27) /* 128 MiB */
+
+struct frct_pci {
+ uint16_t flags;
+ uint16_t hcs;
+
+ uint32_t window;
uint32_t seqno;
uint32_t ackno;
} __attribute__((packed));
+/* Stat counters; fold to no-ops without PROC_FLOW_STATS. */
#ifdef PROC_FLOW_STATS
+struct frcti_stat {
+ size_t rxm_rto; /* RTO-timer driven retransmits */
+ size_t rxm_rcv; /* RXM packets received (all) */
+ size_t rxm_dup_rcv; /* RXM dups (peer already had it) */
+ size_t rxm_sack; /* SACK-mechanism retransmits */
+ size_t rxm_rack; /* RACK-driven retransmits */
+ size_t rxm_dupthresh; /* DupThresh-driven retransmits */
+ size_t rxm_nack; /* NACK-pulled retransmits */
+ size_t rxm_due_count; /* rxm_due entries (pre-bail) */
+ size_t rxm_due_acked; /* bail: seqno < snd_lwe */
+ size_t rxm_due_unowned; /* bail: slot.rxm replaced */
+ size_t rxm_due_aged; /* bail: r->t0 + t_r < now */
+ size_t rxm_due_defer; /* bail: non-HoL, deferred to HoL */
+ size_t rxm_arm_fail; /* rxm_arm: malloc failed */
+ size_t rxm_cancel; /* entries cancelled at teardown */
+ size_t rxm_tx_dead; /* RXM tx into terminal flow */
+ size_t tx_drop; /* frct_tx fail (any cause) */
+ size_t tx_drop_ack; /* bare ACK dropped */
+ size_t tx_drop_sack; /* SACK dropped */
+ size_t tx_drop_ka; /* keepalive dropped */
+ size_t tx_drop_rttp; /* RTT probe/echo dropped */
+ size_t tx_drop_nack; /* pre-DRF NACK dropped */
+ size_t tx_drop_rdv; /* rendez-vous dropped */
+ size_t tx_drop_other; /* anything not matched above */
+ size_t ack_snd; /* ACK packets sent (bare + SACK) */
+ size_t ack_fire; /* delayed-ACK timer fires */
+ size_t ack_supp_seqno; /* fire suppressed: seqno */
+ size_t ack_supp_inact; /* fire suppressed: inact */
+ size_t ack_supp_rate; /* fire suppressed: rate */
+ size_t ack_rcv; /* ACK packets received */
+ size_t ack_rtt; /* ACKs that fed RTT estimator */
+ size_t ack_dup_rcv; /* ACK packet wire dups dropped */
+ size_t dup_rcv; /* duplicates received */
+ size_t out_rcv; /* pkts out of window */
+ size_t rqo_rcv; /* pkts out of rqueue */
+ size_t ooo_rcv; /* OOO arrivals */
+ size_t sack_snd; /* SACK packets sent */
+ size_t sack_rcv; /* SACK packets received */
+ size_t dsack_snd; /* SACK pkts carrying a DSACK */
+ size_t dsack_rcv; /* DSACK blocks parsed */
+ size_t dsack_drop; /* DSACK blocks past MAX_DSACK_LAG */
+ size_t nack_snd; /* pre-DRF NACKs sent */
+ size_t nack_rcv; /* pre-DRF NACKs received */
+ size_t tlp_snd; /* tail loss probes sent */
+ size_t inact_drop; /* inactivity drop (NACK on cd) */
+ size_t drf_rebase; /* DRF-triggered window rebase */
+ size_t rq_released; /* slots cleared by release_rq */
+ size_t rttp_snd; /* RTT probes sent */
+ size_t rttp_rcv; /* RTT probe replies rcvd */
+ size_t rtt_smpl; /* RTT estimator samples */
+ size_t rdv_snd; /* rendez-vous packets sent */
+ size_t rdv_rcv; /* rendez-vous packets rcvd */
+ size_t ka_snd; /* keepalives sent */
+ size_t ka_rcv; /* keepalives received */
+ size_t sdu_snd_frag; /* writes that fragmented */
+ size_t sdu_snd_alloc; /* alloc fail truncated SDU send */
+ size_t sdu_snd_tx; /* tx fail truncated SDU send */
+ size_t frag_snd; /* fragments sent: FIRST/MID/LAST */
+ size_t frag_rcv; /* fragments stashed in rq[] */
+ size_t sdu_reasm; /* SDUs delivered reassembled */
+ size_t sdu_sole; /* SOLE SDUs delivered (n==1) */
+ size_t frag_drop; /* dropped at malformed run */
+ size_t strm_snd_byte; /* bytes sent on stream */
+ size_t strm_rcv_byte; /* bytes copied to ring */
+ size_t strm_dlv_byte; /* bytes delivered to reader */
+ size_t strm_drop; /* stream rcvs dropped */
+ size_t strm_fin_drop; /* stream FIN packets rejected */
+ /* Profiling instrumentation. */
+ size_t rcv_proc_ns; /* time inside FRCTI_RCV (ns) */
+ size_t tw_move_ns; /* time inside tw_move (ns) */
+ size_t drain_calls; /* flow_drain_rx_nb invocations */
+};
+
+#define STAT_BUMP(frcti, field) FETCH_ADD_RELAXED(&(frcti)->stat.field, 1)
+#define STAT_ADD(frcti, field, v) FETCH_ADD_RELAXED(&(frcti)->stat.field, (v))
+#define STAT_LOAD(frcti, field) LOAD_RELAXED(&(frcti)->stat.field)
+#else
+#define STAT_BUMP(frcti, field) ((void) (frcti))
+#define STAT_ADD(frcti, field, v) ((void) (frcti))
+#define STAT_LOAD(frcti, field) ((void) (frcti), (size_t) 0)
+#endif
+
+#define frcti_to_flow(f) (&proc.flows[(f)->fd])
+#define RTTP_RING 8
+#define RTTP_COLD_NS (100 * MILLION) /* cold-probe cadence */
+#define RQ_SLOT(seqno) ((seqno) & (RQ_SIZE - 1))
+
+struct rxm_entry;
+
+enum snd_slot_flags {
+ SND_RTX = 0x01, /* Any retransmit; Karn skips next RTT sample. */
+ SND_FAST_RXM = 0x02, /* Fast-retx one-shot gate per loss event. */
+ SND_TLP = 0x04, /* Tail loss probe; ACK resets rto_mul. */
+};
+
+struct snd_slot {
+ struct rxm_entry * rxm; /* RXM entry, NULL if none. */
+ uint64_t time; /* ts_to_ns of last send (any kind). */
+ uint8_t flags; /* SND_* bits above. */
+};
+
+/* Per-seqno reorder slot (FRTX) and stream-mode byte/FIN metadata. */
+struct rcv_slot {
+ ssize_t idx; /* spb idx; -1 = empty */
+ uint32_t start; /* stream byte start */
+ uint32_t end; /* stream byte end */
+ uint8_t fin; /* stream FIN bit */
+};
+
+struct frct_cr {
+ uint32_t lwe; /* Left window edge */
+ uint32_t rwe; /* Right window edge */
+
+ uint8_t cflags;
+ uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */
+ uint32_t ackno; /* snd: ACK-pkt seqno; rcv: dedup */
+
+ uint64_t act; /* ts_to_ns of last activity */
+ uint64_t inact; /* Inactivity threshold (ns) */
+};
+
+struct frcti {
+ /* IMM: set once in frcti_create; read-only thereafter. */
+ int fd;
+ uint64_t t_mpl; /* MPL (ns) */
+ uint64_t t_a; /* a-timer (ns) */
+ uint64_t t_r; /* r-timer (ns) */
+ uint64_t t_rdv; /* RDV cooldown (ns) */
+ time_t ber; /* cached qs.ber */
+ bool lossy; /* qs.loss != 0 */
+ time_t qs_timeout; /* cached qs.timeout (ms) */
+ size_t frag_mtu; /* max FRCT pkt: PCI + payload */
+ uint16_t sack_n_max; /* SACK blocks that fit MTU */
+ bool stream;
+
+ /* All fields below are protected by lock (rwlock/LOAD_ACQUIRE). */
+ struct {
+ struct frct_cr snd_cr;
+ struct frct_cr rcv_cr;
+
+ /* RTT/RACK estimator */
+ time_t srtt; /* smoothed RTT */
+ time_t mdev; /* mean deviation */
+ time_t min_rtt; /* RACK base, ns */
+ uint64_t t_min_rtt; /* min_rtt last set */
+ time_t rto; /* retransmit TO */
+ time_t rto_min; /* RTO floor (ns) */
+ uint8_t rto_mul; /* RTO backoff bits */
+ uint32_t rtt_lwe; /* RTT-sample fence */
+ uint64_t t_rcv_rtt; /* last RTT feed */
+ uint64_t t_snd_probe; /* last probe sent */
+ uint64_t t_latest_ack; /* RACK.fack snd-ts */
+ uint32_t probe_id_next;
+ struct {
+ uint32_t id;
+ uint64_t ts; /* ts_to_ns send */
+ uint8_t nonce[RTTP_NONCE_LEN]; /* echoed back */
+ } probes[RTTP_RING];
+
+ /* rcv reassembly */
+ size_t max_rcv_sdu; /* max reasm bytes */
+ uint8_t * rcv_ring; /* lazy alloc */
+ size_t rcv_ring_sz; /* power of 2 */
+ uint32_t ring_seq_cap; /* ring/per_pkt */
+
+ uint32_t snd_byte_next;
+ bool snd_fin_sent;
+ uint32_t snd_fin_seqno;
+ uint32_t rcv_byte_next;
+ uint32_t rcv_byte_high; /* contiguous high */
+ uint32_t rcv_byte_fin; /* set when FIN */
+ bool rcv_fin_seen;
+
+ struct rcv_slot rcv_slots[RQ_SIZE];
+ struct snd_slot snd_slots[RQ_SIZE]; /* .rxm is ATOM */
+
+ /* rcv SACK dedup */
+ uint64_t t_snd_sack;
+ uint32_t sack_lwe; /* rcv lwe at SACK */
+ uint16_t sack_n; /* SACK block count */
+
+ /* RFC 2883 D-SACK: pending report (single-slot, latest). */
+ uint32_t dsack_seqno;
+ bool dsack_valid;
+
+ /* RFC 8985 §7.2 RACK reorder-window scaling. */
+ uint8_t reo_wnd_mult; /* REO_WND_MULT_MAX */
+ uint32_t dsack_lwe_snap; /* lwe @ last DSACK */
+ uint64_t t_last_reo_widen; /* once-per-RTT */
+
+ uint32_t dup_thresh; /* RFC 8985 */
+ uint32_t tlp_high_seq; /* §7.3: 0 = none */
+ uint8_t tlp_count; /* §7.3 per-episode */
+ uint64_t t_nack;
+ bool open; /* FC window state */
+ bool in_recovery;
+ uint32_t recovery_high; /* seqno @ entry */
+ uint32_t rack_fired_lwe; /* lwe @ last RACK */
+ struct timespec t_wnd; /* window-closed ts */
+ struct timespec t_last_rdv; /* last RDV sent */
+ struct list_head rxm_list; /* live rxm entries */
+
+ pthread_rwlock_t lock;
+ };
+
+ /* Read/written via __atomic without holding lock. */
+ uint64_t t_ka_rcv; /* ts_to_ns of last KA rx */
+ uint8_t ack_pending; /* delayed-ACK dedup */
+ uint8_t tlp_pending; /* TLP arm dedup (lazy) */
+
+ /* Timer entries; ownership belongs to the tw module. */
+ struct tw_entry ack_tw; /* delayed-ACK timer */
+ struct tw_entry ka_tw; /* keepalive timer */
+ struct tw_entry tlp_tw; /* tail-loss probe timer */
+
+#ifdef PROC_FLOW_STATS
+ /* STAT: lock-free relaxed atomic counters. */
+ struct frcti_stat stat;
+#endif
+};
+
+#ifdef PROC_FLOW_STATS
+
+__attribute__((cold))
static int frct_rib_read(const char * path,
char * buf,
size_t len)
{
+ struct frcti * frcti;
struct timespec now;
+ uint64_t now_ns;
char * entry;
- struct flow * flow;
- struct frcti * frcti;
int fd;
-
- (void) len;
+ int written;
+ /* Snapshot under the locks; format outside (pure userspace). */
+ struct {
+ uint64_t t_mpl;
+ uint64_t t_a;
+ uint64_t t_r;
+ time_t srtt;
+ time_t mdev;
+ time_t rto;
+ time_t min_rtt;
+ struct frct_cr snd_cr;
+ struct frct_cr rcv_cr;
+ size_t rx_q_now;
+ size_t tx_q_now;
+ struct frcti_stat stat;
+ } s;
entry = strstr(path, RIB_SEPARATOR);
assert(entry);
@@ -118,23 +437,50 @@ static int frct_rib_read(const char * path,
fd = atoi(path);
- flow = &proc.flows[fd];
-
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
+ return 0;
pthread_rwlock_rdlock(&proc.lock);
- frcti = flow->frcti;
+ frcti = proc.flows[fd].frcti;
+ if (frcti == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return 0;
+ }
+
+ s.t_mpl = frcti->t_mpl;
+ s.t_a = frcti->t_a;
+ s.t_r = frcti->t_r;
+
+ s.rx_q_now = proc.flows[fd].rx_rb != NULL
+ ? ssm_rbuff_queued(proc.flows[fd].rx_rb) : 0;
+ s.tx_q_now = proc.flows[fd].tx_rb != NULL
+ ? ssm_rbuff_queued(proc.flows[fd].tx_rb) : 0;
pthread_rwlock_rdlock(&frcti->lock);
- sprintf(buf,
- "Maximum packet lifetime (ns): %20ld\n"
- "Max time to Ack (ns): %20ld\n"
- "Max time to Retransmit (ns): %20ld\n"
+ s.srtt = frcti->srtt;
+ s.mdev = frcti->mdev;
+ s.rto = frcti->rto;
+ s.min_rtt = frcti->min_rtt;
+ s.snd_cr = frcti->snd_cr;
+ s.rcv_cr = frcti->rcv_cr;
+ s.stat = frcti->stat;
+
+ pthread_rwlock_unlock(&frcti->lock);
+ pthread_rwlock_unlock(&proc.lock);
+
+ written = snprintf(buf, len,
+ "Maximum packet lifetime (ns): %20" PRIu64 "\n"
+ "Max time to Ack (ns): %20" PRIu64 "\n"
+ "Max time to Retransmit (ns): %20" PRIu64 "\n"
"Smoothed rtt (ns): %20ld\n"
"RTT standard deviation (ns): %20ld\n"
"Retransmit timeout RTO (ns): %20ld\n"
+ "Minimum RTT (RACK base, ns): %20ld\n"
"Sender left window edge: %20u\n"
"Sender right window edge: %20u\n"
"Sender inactive (ns): %20lld\n"
@@ -143,44 +489,132 @@ static int frct_rib_read(const char * path,
"Receiver right window edge: %20u\n"
"Receiver inactive (ns): %20lld\n"
"Receiver last ack: %20u\n"
- "Number of pkt retransmissions: %20zu\n"
- "Number of rtt probes: %20zu\n"
- "Number of rtt estimates: %20zu\n"
- "Number of duplicates received: %20zu\n"
- "Number of delayed acks received: %20zu\n"
- "Number of rendez-vous sent: %20zu\n"
- "Number of packets out of window: %20zu\n"
- "Number of packets out of rqueue: %20zu\n",
- frcti->mpl,
- frcti->a,
- frcti->r,
- frcti->srtt,
- frcti->mdev,
- frcti->rto,
- frcti->snd_cr.lwe,
- frcti->snd_cr.rwe,
- ts_diff_ns(&now, &frcti->snd_cr.act),
- frcti->snd_cr.seqno,
- frcti->rcv_cr.lwe,
- frcti->rcv_cr.rwe,
- ts_diff_ns(&now, &frcti->rcv_cr.act),
- frcti->rcv_cr.seqno,
- frcti->n_rtx,
- frcti->n_prb,
- frcti->n_rtt,
- frcti->n_dup,
- frcti->n_dak,
- frcti->n_rdv,
- frcti->n_out,
- frcti->n_rqo);
-
- pthread_rwlock_unlock(&flow->frcti->lock);
+ "RXM (RTO-driven) sent: %20zu\n"
+ "RXM packets received: %20zu\n"
+ " duplicates received: %20zu\n"
+ "RXM (SACK mechanism) sent: %20zu\n"
+ "RXM (RACK-driven) sent: %20zu\n"
+ "RXM (DupThresh-driven) sent: %20zu\n"
+ "RXM (NACK-driven) sent: %20zu\n"
+ "ACK packets sent: %20zu\n"
+ "Delayed-ACK timer fires: %20zu\n"
+ " suppressed (seqno): %20zu\n"
+ " suppressed (inact): %20zu\n"
+ " suppressed (rate): %20zu\n"
+ "ACK packets received: %20zu\n"
+ " fed RTT estimator: %20zu\n"
+ " wire dups dropped: %20zu\n"
+ "Duplicates received: %20zu\n"
+ "Out-of-window pkts received: %20zu\n"
+ "Out-of-rqueue pkts received: %20zu\n"
+ "OOO arrivals: %20zu\n"
+ "SACKs sent: %20zu\n"
+ "SACKs received: %20zu\n"
+ "D-SACKs sent: %20zu\n"
+ "D-SACKs received: %20zu\n"
+ "D-SACK out-of-range dropped: %20zu\n"
+ "Pre-DRF NACKs sent: %20zu\n"
+ "Pre-DRF NACKs received: %20zu\n"
+ "Tail loss probes sent: %20zu\n"
+ "Inactivity drops (silent): %20zu\n"
+ "DRF window rebases: %20zu\n"
+ "rq slots cleared by release_rq: %20zu\n"
+ "RTT probes sent: %20zu\n"
+ "RTT probe replies received: %20zu\n"
+ "RTT estimator samples: %20zu\n"
+ "Rendez-vous packets sent: %20zu\n"
+ "Rendez-vous packets received: %20zu\n"
+ "Keepalives sent: %20zu\n"
+ "Keepalives received: %20zu\n"
+ "SDU writes fragmented: %20zu\n"
+ " alloc fail mid-SDU: %20zu\n"
+ " tx fail mid-SDU: %20zu\n"
+ "Fragments sent: %20zu\n"
+ "Fragments received: %20zu\n"
+ "SDUs delivered reassembled: %20zu\n"
+ "SDUs delivered (SOLE): %20zu\n"
+ "Fragments dropped (malformed): %20zu\n"
+ "Stream bytes sent: %20zu\n"
+ "Stream bytes received: %20zu\n"
+ "Stream bytes delivered: %20zu\n"
+ "Stream packets dropped: %20zu\n"
+ "Stream FINs dropped: %20zu\n"
+ "FRCTI_RCV time (ns): %20zu\n"
+ "tw_move time (ns): %20zu\n"
+ "drain_rx_nb calls: %20zu\n"
+ "RX rbuff queued: %20zu\n"
+ "TX rbuff queued: %20zu\n"
+ "RXM-due entries: %20zu\n"
+ " bail (acked): %20zu\n"
+ " bail (unowned): %20zu\n"
+ " bail (aged): %20zu\n"
+ " bail (defer): %20zu\n"
+ "RXM-arm malloc failures: %20zu\n"
+ "RXM cancels (teardown): %20zu\n"
+ "RXM tx into dead flow: %20zu\n"
+ "Tx ring drops (any cause): %20zu\n"
+ " ack: %20zu\n"
+ " sack: %20zu\n"
+ " ka: %20zu\n"
+ " rttp: %20zu\n"
+ " nack: %20zu\n"
+ " rdv: %20zu\n"
+ " other: %20zu\n",
+ /* Check getattr size below when adding stats. */
+ s.t_mpl, s.t_a, s.t_r,
+ s.srtt, s.mdev, s.rto, s.min_rtt,
+ s.snd_cr.lwe, s.snd_cr.rwe,
+ (long long)(now_ns - s.snd_cr.act),
+ s.snd_cr.seqno,
+ s.rcv_cr.lwe, s.rcv_cr.rwe,
+ (long long)(now_ns - s.rcv_cr.act),
+ s.rcv_cr.seqno,
+ s.stat.rxm_rto, s.stat.rxm_rcv, s.stat.rxm_dup_rcv,
+ s.stat.rxm_sack, s.stat.rxm_rack, s.stat.rxm_dupthresh,
+ s.stat.rxm_nack,
+ s.stat.ack_snd, s.stat.ack_fire,
+ s.stat.ack_supp_seqno, s.stat.ack_supp_inact,
+ s.stat.ack_supp_rate,
+ s.stat.ack_rcv, s.stat.ack_rtt, s.stat.ack_dup_rcv,
+ s.stat.dup_rcv, s.stat.out_rcv, s.stat.rqo_rcv,
+ s.stat.ooo_rcv,
+ s.stat.sack_snd, s.stat.sack_rcv,
+ s.stat.dsack_snd, s.stat.dsack_rcv, s.stat.dsack_drop,
+ s.stat.nack_snd, s.stat.nack_rcv, s.stat.tlp_snd,
+ s.stat.inact_drop, s.stat.drf_rebase, s.stat.rq_released,
+ s.stat.rttp_snd, s.stat.rttp_rcv, s.stat.rtt_smpl,
+ s.stat.rdv_snd, s.stat.rdv_rcv,
+ s.stat.ka_snd, s.stat.ka_rcv,
+ s.stat.sdu_snd_frag, s.stat.sdu_snd_alloc, s.stat.sdu_snd_tx,
+ s.stat.frag_snd, s.stat.frag_rcv,
+ s.stat.sdu_reasm, s.stat.sdu_sole, s.stat.frag_drop,
+ s.stat.strm_snd_byte, s.stat.strm_rcv_byte,
+ s.stat.strm_dlv_byte,
+ s.stat.strm_drop, s.stat.strm_fin_drop,
+ s.stat.rcv_proc_ns, s.stat.tw_move_ns,
+ s.stat.drain_calls,
+ s.rx_q_now, s.tx_q_now,
+ s.stat.rxm_due_count,
+ s.stat.rxm_due_acked, s.stat.rxm_due_unowned,
+ s.stat.rxm_due_aged, s.stat.rxm_due_defer,
+ s.stat.rxm_arm_fail,
+ s.stat.rxm_cancel,
+ s.stat.rxm_tx_dead, s.stat.tx_drop,
+ s.stat.tx_drop_ack, s.stat.tx_drop_sack,
+ s.stat.tx_drop_ka, s.stat.tx_drop_rttp,
+ s.stat.tx_drop_nack, s.stat.tx_drop_rdv,
+ s.stat.tx_drop_other);
+
+ if (written < 0)
+ return 0;
- pthread_rwlock_unlock(&proc.lock);
+ if ((size_t) written >= len)
+ return (int) (len - 1);
- return strlen(buf);
+ return written;
}
+__attribute__((cold))
static int frct_rib_readdir(char *** buf)
{
*buf = malloc(sizeof(**buf));
@@ -199,13 +633,14 @@ static int frct_rib_readdir(char *** buf)
return -ENOMEM;
}
+__attribute__((cold))
static int frct_rib_getattr(const char * path,
struct rib_attr * attr)
{
(void) path;
- (void) attr;
- attr->size = 1189;
+ /* Must be >= the sprintf output in frct_rib_read. */
+ attr->size = 8192;
attr->mtime = 0;
return 0;
@@ -220,128 +655,1172 @@ static struct rib_ops r_ops = {
#endif /* PROC_FLOW_STATS */
-static bool before(uint32_t seq1,
- uint32_t seq2)
+static __inline__ bool before(uint32_t s1, uint32_t s2)
{
- return (int32_t)(seq1 - seq2) < 0;
+ return (int32_t)(s1 - s2) < 0;
}
-static bool after(uint32_t seq1,
- uint32_t seq2)
+static __inline__ bool after(uint32_t s1, uint32_t s2)
{
- return (int32_t)(seq2 - seq1) < 0;
+ return (int32_t)(s2 - s1) < 0;
}
-static void __send_frct_pkt(int fd,
- uint8_t flags,
- uint32_t ackno,
- uint32_t rwe)
+static __inline__ bool within(uint32_t seq, uint32_t lo, uint32_t hi)
{
- struct ssm_pk_buff * spb;
- struct frct_pci * pci;
- ssize_t idx;
- struct flow * f;
+ return after(seq, lo) && !after(seq, hi);
+}
- /* Raw calls needed to bypass frcti. */
-#ifdef RXM_BLOCKING
- idx = ssm_pool_alloc_b(proc.pool, sizeof(*pci), NULL, &spb, NULL);
-#else
- idx = ssm_pool_alloc(proc.pool, sizeof(*pci), NULL, &spb);
-#endif
- if (idx < 0)
+static __inline__ bool in_window(uint32_t seq, const struct frct_cr * cr)
+{
+ return !before(seq, cr->lwe) && before(seq, cr->rwe);
+}
+
+/* DRF arrival that stays within the current receive epoch. */
+static __inline__ bool same_epoch_drf(uint32_t seq,
+ uint16_t flags,
+ const struct frct_cr * cr)
+{
+ if (cr->lwe == cr->rwe)
+ return false;
+
+ return (flags & FRCT_RXM) || in_window(seq, cr);
+}
+
+/*
+ * RACK reorder window R (RFC 8985 §6.2):
+ * R = MIN(reo_wnd_mult * RACK.min_RTT / 4, SRTT)
+ * reo_wnd_mult scales on D-SACK evidence of under-tolerance (§7.2).
+ * Fall back to srtt when no min_rtt sample exists yet; MIN_REORDER_NS
+ * floor guards collapse below the timer-tick resolution.
+ */
+static __inline__ uint64_t rack_reorder_window(struct frcti * frcti)
+{
+ uint64_t mult = frcti->reo_wnd_mult > 0 ? frcti->reo_wnd_mult : 1;
+ uint64_t base = frcti->min_rtt > 0 ? (uint64_t) frcti->min_rtt
+ : (uint64_t) frcti->srtt;
+ uint64_t R = mult * (base / 4);
+
+ R = MAX(R, (uint64_t) MIN_REORDER_NS);
+ R = MIN(R, (uint64_t) frcti->srtt);
+
+ return R;
+}
+
+static __inline__ int frct_spb_reserve(size_t len,
+ struct ssm_pk_buff ** spb)
+{
+ ssize_t idx = ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL);
+
+ return idx < 0 ? (int) idx : 0;
+}
+
+static __inline__ void frct_spb_release(struct ssm_pk_buff * spb)
+{
+ ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb));
+}
+
+static __inline__ void frct_spb_release_idx(size_t idx)
+{
+ ssm_pool_remove(proc.pool, idx);
+}
+
+/* Fetch the spb stashed at the rq slot for seqno. */
+static __inline__ struct ssm_pk_buff * rq_frag(const struct frcti * frcti,
+ uint32_t seqno)
+{
+ return ssm_pool_get(proc.pool, frcti->rcv_slots[RQ_SLOT(seqno)].idx);
+}
+
+static __inline__ size_t frcti_data_hdr_len(const struct frcti * frcti)
+{
+ return FRCT_PCILEN + (frcti->stream ? FRCT_PCI_STREAM_LEN : 0);
+}
+
+static __inline__ size_t frcti_ctrl_hdr_len(const struct frcti * frcti)
+{
+ (void) frcti;
+
+ return FRCT_PCILEN;
+}
+
+/*
+ * HCS at offset 2 inside PCI. Covers flags (bytes 0..1) and
+ * window/seqno/ackno (bytes 4..15), plus SPCI for stream DATA.
+ */
+static void frct_hcs_set(struct frct_pci * pci,
+ bool stream)
+{
+ uint16_t hcs = 0;
+ size_t tail;
+
+ tail = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs);
+ if (stream)
+ tail += FRCT_PCI_STREAM_LEN;
+
+ crc16_ccitt_false(&hcs, pci, sizeof(pci->flags));
+ crc16_ccitt_false(&hcs, &pci->window, tail);
+
+ pci->hcs = hton16(hcs);
+}
+
+static int frct_hcs_check(const struct frct_pci * pci,
+ const struct frcti * frcti)
+{
+ uint16_t hcs = 0;
+ uint16_t flags;
+ size_t tail;
+
+ /* Untrusted flag read; mismatch on HCS will drop on corrupt. */
+ flags = ntoh16(pci->flags);
+
+ tail = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs);
+ if (frcti->stream && (flags & FRCT_DATA))
+ tail += FRCT_PCI_STREAM_LEN;
+
+ crc16_ccitt_false(&hcs, pci, sizeof(pci->flags));
+ crc16_ccitt_false(&hcs, &pci->window, tail);
+
+ return hcs != ntoh16(pci->hcs);
+}
+
+/* Bump tx_drop plus the per-frame-type counter matching `flags`. */
+static void frct_tx_drop_bump(struct frcti * frcti,
+ uint16_t flags)
+{
+ STAT_BUMP(frcti, tx_drop);
+
+ if (flags & FRCT_SACK) {
+ STAT_BUMP(frcti, tx_drop_sack);
return;
+ }
- pci = (struct frct_pci *) ssm_pk_buff_head(spb);
- memset(pci, 0, sizeof(*pci));
+ if (flags & FRCT_KA) {
+ STAT_BUMP(frcti, tx_drop_ka);
+ return;
+ }
- *((uint32_t *) pci) = hton32(rwe);
+ if (flags & FRCT_RTTP) {
+ STAT_BUMP(frcti, tx_drop_rttp);
+ return;
+ }
- pci->flags = flags;
- pci->ackno = hton32(ackno);
+ if (flags & FRCT_NACK) {
+ STAT_BUMP(frcti, tx_drop_nack);
+ return;
+ }
- f = &proc.flows[fd];
+ if (flags & FRCT_RDVS) {
+ STAT_BUMP(frcti, tx_drop_rdv);
+ return;
+ }
+
+ if (flags & FRCT_ACK) {
+ STAT_BUMP(frcti, tx_drop_ack);
+ return;
+ }
+
+ STAT_BUMP(frcti, tx_drop_other);
+}
+
+static int frct_tx(struct frcti * frcti, struct ssm_pk_buff * spb)
+{
+ struct flow * f = frcti_to_flow(frcti);
+ const struct frct_pci * pci;
+ const struct timespec * dl = NULL;
+ struct timespec now;
+ struct timespec intv = TIMESPEC_INIT_NS(FRCT_TX_TIMEO_NS);
+ struct timespec deadline;
+ uint16_t flags;
+ ssize_t idx;
+ int ret = -ENOMEM;
+
+ pci = (const struct frct_pci *) ssm_pk_buff_head(spb);
+ flags = ntoh16(pci->flags);
+
+ /* CRC32 covers plaintext body; PCI is in HCS. Pre-encrypt. */
+ if (flags & FRCT_SACK) {
+ if (crc_add(spb, frcti_ctrl_hdr_len(frcti)) != 0)
+ goto fail;
+ } else if ((flags & FRCT_DATA) && f->info.qs.ber == 0) {
+ if (crc_add(spb, frcti_data_hdr_len(frcti)) != 0)
+ goto fail;
+ }
if (spb_encrypt(f, spb) < 0)
goto fail;
-#ifdef RXM_BLOCKING
- if (ssm_rbuff_write_b(f->tx_rb, idx, NULL))
-#else
- if (ssm_rbuff_write(f->tx_rb, idx))
-#endif
+ idx = ssm_pk_buff_get_off(spb);
+
+ /* DATA blocks; control times out so a full ring can't stall wheel. */
+ if (!(flags & FRCT_DATA)) {
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, &intv, &deadline);
+ dl = &deadline;
+ }
+
+ ret = ssm_rbuff_write_b(f->tx_rb, idx, dl);
+ if (ret < 0)
goto fail;
ssm_flow_set_notify(f->set, f->info.id, FLOW_PKT);
- return;
+ return 0;
fail:
- ipcp_spb_release(spb);
- return;
+ frct_tx_drop_bump(frcti, flags);
+ ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb));
+ return ret;
+}
+
+__attribute__((cold))
+static void frct_mark_flow_down(struct frcti * frcti)
+{
+ struct flow * f = frcti_to_flow(frcti);
+
+ if (f->rx_rb != NULL)
+ ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWDOWN);
+
+ if (f->tx_rb != NULL)
+ ssm_rbuff_set_acl(f->tx_rb, ACL_FLOWDOWN);
+}
+
+__attribute__((cold))
+static void frct_mark_peer_dead(struct frcti * frcti)
+{
+ struct flow * f = frcti_to_flow(frcti);
+
+ if (f->rx_rb != NULL)
+ ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWPEER);
+
+ if (proc.fqset != NULL)
+ ssm_flow_set_notify(proc.fqset, f->info.id, FLOW_PEER);
+}
+
+static __inline__ int frct_ctrl_alloc(struct ssm_pk_buff ** spb,
+ struct frct_pci ** pci,
+ size_t payload_len)
+{
+ if (frct_spb_reserve(FRCT_PCILEN + payload_len, spb) < 0)
+ return -1;
+
+ *pci = (struct frct_pci *) ssm_pk_buff_head(*spb);
+ memset(*pci, 0, FRCT_PCILEN);
+
+ return 0;
+}
+
+/*
+ * Advertised rwe. Stream mode clamps to lwe + ring_seq_cap so the
+ * byte-equivalent fits the rx ring. Caller holds at least the rdlock.
+ */
+static __inline__ uint32_t frcti_advert_rwe(struct frcti * frcti)
+{
+ uint32_t rwe;
+ uint32_t cap;
+
+ rwe = frcti->rcv_cr.rwe;
+
+ if (!frcti->stream)
+ return rwe;
+
+ cap = frcti->rcv_cr.lwe + frcti->ring_seq_cap;
+
+ return before(cap, rwe) ? cap : rwe;
+}
+
+static void frcti_pkt_snd(struct frcti * frcti,
+ uint16_t flags,
+ uint32_t ackno,
+ uint32_t rwe)
+{
+ struct ssm_pk_buff * spb;
+ struct frct_pci * pci;
+
+ if (frct_ctrl_alloc(&spb, &pci, 0) < 0)
+ return;
+
+ pci->flags = hton16(flags);
+ pci->window = hton32(rwe);
+ pci->ackno = hton32(ackno);
+ if (flags & FRCT_ACK) {
+ /* reuse ackno for the sequence number of delayed ACK */
+ ackno = FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1);
+ pci->seqno = hton32(ackno + 1);
+ }
+
+ frct_hcs_set(pci, false);
+
+ frct_tx(frcti, spb);
+}
+
+/* RTO floor scales with srtt; hard floor rto_min guards sub-ms RTT. */
+static void rtt_init(struct frcti * frcti,
+ time_t rtt_hint)
+{
+ time_t floor;
+
+ if (rtt_hint > 0) {
+ rtt_hint = MAX(rtt_hint, (time_t) RTT_BOOT_NS);
+ frcti->srtt = rtt_hint;
+ frcti->mdev = rtt_hint >> 3;
+ floor = MAX(frcti->rto_min, 2 * frcti->srtt);
+ frcti->rto = MAX(floor, rtt_hint + (frcti->mdev << MDEV_MUL));
+ frcti->min_rtt = rtt_hint;
+ } else {
+ /* Boot from first ACK. */
+ frcti->srtt = 0;
+ frcti->mdev = RTT_BOOT_NS;
+ frcti->rto = MAX((time_t) INITIAL_RTO, frcti->rto_min);
+ frcti->min_rtt = 0;
+ }
+
+ frcti->rto_mul = 0;
+}
+
+/* RFC 8985 §6.2: replace min_RTT on unset, smaller sample, or expiry. */
+static __inline__ bool min_rtt_stale(struct frcti * frcti,
+ time_t mrtt,
+ uint64_t now_ns)
+{
+ if (frcti->min_rtt == 0)
+ return true;
+
+ if (mrtt < frcti->min_rtt)
+ return true;
+
+ return ts_aged_ns(now_ns, frcti->t_min_rtt, MIN_RTT_WIN_NS);
+}
+
+/* Linux-style windowed-min refresh of RACK.min_RTT. */
+static __inline__ void min_rtt_update(struct frcti * frcti,
+ time_t mrtt,
+ uint64_t now_ns)
+{
+ if (!min_rtt_stale(frcti, mrtt, now_ns))
+ return;
+
+ frcti->min_rtt = mrtt;
+ frcti->t_min_rtt = now_ns;
+}
+
+static void rtt_update(struct frcti * frcti,
+ time_t mrtt,
+ uint64_t now_ns)
+{
+ time_t srtt = frcti->srtt;
+ time_t rttvar = frcti->mdev;
+ time_t floor;
+ time_t rto;
+
+ if (srtt == 0) {
+ srtt = mrtt;
+ rttvar = mrtt >> 1;
+ } else {
+ /* RFC 6298 symmetric EWMA. */
+ time_t delta = mrtt - srtt;
+ srtt += (delta >> 3);
+ delta = (ABS(delta) - rttvar) >> 2;
+#ifdef FRCT_LINUX_RTT_ESTIMATOR
+ if (delta < 0)
+ delta >>= 3;
+#endif
+ rttvar += delta;
+ }
+ STAT_BUMP(frcti, rtt_smpl);
+ frcti->srtt = MAX(SRTT_FLOOR_NS, srtt);
+ frcti->mdev = MAX(MDEV_FLOOR_NS, rttvar);
+
+ min_rtt_update(frcti, mrtt, now_ns);
+
+ floor = MAX(frcti->rto_min, 2 * frcti->srtt);
+ rto = MAX(floor, frcti->srtt + (frcti->mdev << MDEV_MUL));
+
+ STORE_RELEASE(&frcti->rto, rto);
+ STORE_RELEASE(&frcti->rto_mul, 0);
+}
+
+/* Fill probes[pos], return new probe_id; 0 on entropy failure. Wrlock. */
+static uint32_t rttp_alloc_probe(struct frcti * frcti,
+ uint64_t now_ns,
+ uint8_t nonce[RTTP_NONCE_LEN])
+{
+ uint32_t probe_id;
+ size_t pos;
+
+ if (random_buffer(nonce, RTTP_NONCE_LEN) < 0)
+ return 0;
+
+ probe_id = frcti->probe_id_next++;
+ if (probe_id == 0)
+ probe_id = frcti->probe_id_next++;
+
+ pos = RTTP_POS(probe_id);
+ frcti->probes[pos].id = probe_id;
+ frcti->probes[pos].ts = now_ns;
+ memcpy(frcti->probes[pos].nonce, nonce, RTTP_NONCE_LEN);
+ frcti->t_snd_probe = now_ns;
+
+ STAT_BUMP(frcti, rttp_snd);
+
+ return probe_id;
+}
+
+/* Caller wrlock; out args valid on true (caller emits post-unlock). */
+static bool rtt_probe_arm(struct frcti * frcti,
+ uint64_t now_ns,
+ uint32_t * probe_id,
+ uint8_t nonce[RTTP_NONCE_LEN])
+{
+ if (frcti->srtt == 0)
+ return false;
+
+ if (!after(frcti->snd_cr.seqno, frcti->snd_cr.lwe))
+ return false;
+
+ if (!ts_aged_ns(now_ns, frcti->t_rcv_rtt,
+ 2u * (uint64_t) frcti->srtt))
+ return false;
+
+ if (!ts_aged_ns(now_ns, frcti->t_snd_probe,
+ (uint64_t) frcti->srtt))
+ return false;
+
+ *probe_id = rttp_alloc_probe(frcti, now_ns, nonce);
+
+ return *probe_id != 0;
+}
+
+static void frcti_rttp_snd(struct frcti * frcti,
+ uint32_t probe_id,
+ uint32_t echo_id,
+ const uint8_t * nonce)
+{
+ struct ssm_pk_buff * spb;
+ struct frct_pci * pci;
+ struct frct_rttp * rttp;
+
+ if (frct_ctrl_alloc(&spb, &pci, RTTP_PAYLOAD) < 0)
+ return;
+
+ pci->flags = hton16(FRCT_RTTP);
+
+ frct_hcs_set(pci, false);
+
+ rttp = (struct frct_rttp *) FRCT_BODY(pci);
+ rttp->probe_id = hton32(probe_id);
+ rttp->echo_id = hton32(echo_id);
+ memcpy(rttp->nonce, nonce, sizeof(rttp->nonce));
+
+ frct_tx(frcti, spb);
+}
+
+struct rxm_entry {
+ struct tw_entry tw;
+ struct list_head next; /* in frcti->rxm_list */
+ struct frcti * frcti;
+ uint32_t seqno;
+ uint64_t t0;
+ size_t len;
+ uint8_t pkt[]; /* flexible — sized at alloc time */
+};
+
+static struct rxm_entry * rxm_entry_create(struct frcti * frcti,
+ uint32_t seqno,
+ const struct ssm_pk_buff * spb)
+{
+ struct rxm_entry * r;
+ struct timespec now;
+ size_t len = ssm_pk_buff_len(spb);
+
+ r = malloc(sizeof(*r) + len);
+ if (r == NULL) {
+ STAT_BUMP(frcti, rxm_arm_fail);
+ return NULL;
+ }
+
+ memcpy(r->pkt, ssm_pk_buff_head(spb), len);
+ r->len = len;
+ r->frcti = frcti;
+ r->seqno = seqno;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ r->t0 = TS_TO_UINT64(now);
+
+ tw_init_entry(&r->tw);
+
+ return r;
+}
+
+static void rxm_entry_destroy(struct rxm_entry * r)
+{
+ free(r);
+}
+
+static bool rxm_still_owned(struct frcti * frcti,
+ size_t pos,
+ struct rxm_entry * r)
+{
+ return LOAD_ACQUIRE(&frcti->snd_slots[pos].rxm) == r;
+}
+
+/*
+ * All in-flight slots share the HoL backoff; otherwise non-HoL timers
+ * cycle at base RTO and storm the wire while HoL is still backing off.
+ */
+static uint64_t rxm_next_deadline(struct frcti * frcti,
+ uint64_t now_ns)
+{
+ time_t rto = LOAD_RELAXED(&frcti->rto);
+ uint8_t rto_mul = LOAD_RELAXED(&frcti->rto_mul);
+
+ return now_ns + ((uint64_t) rto << rto_mul);
+}
+
+/* Copy pkt, set FRCT_RXM, refresh ackno, re-seal HCS. */
+static struct ssm_pk_buff * rxm_pkt_prepare(const void * pkt,
+ size_t len,
+ uint32_t rcv_lwe,
+ bool stream)
+{
+ struct ssm_pk_buff * spb;
+ struct frct_pci * pci;
+ uint16_t flags;
+
+ if (frct_spb_reserve(len, &spb) < 0)
+ return NULL;
+
+ pci = (struct frct_pci *) ssm_pk_buff_head(spb);
+ memcpy(pci, pkt, len);
+
+ flags = ntoh16(pci->flags) | FRCT_RXM;
+ pci->flags = hton16(flags);
+ pci->ackno = hton32(rcv_lwe);
+
+ frct_hcs_set(pci, stream);
+
+ return spb;
+}
+
+/* Caller must NOT hold frcti->lock. */
+static void rxm_snd(struct frcti * frcti,
+ uint32_t seqno,
+ const void * pkt,
+ size_t len)
+{
+ struct ssm_pk_buff * spb;
+ struct timespec now;
+ struct snd_slot * slot;
+ uint32_t snd_lwe;
+ uint32_t rcv_lwe;
+ size_t pos;
+ int ret;
+
+ snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe);
+ rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ pos = RQ_SLOT(seqno);
+ slot = &frcti->snd_slots[pos];
+
+ slot->time = TS_TO_UINT64(now);
+ /* RTO supersedes any pending TLP/fast-rxm on this slot. */
+ slot->flags = (slot->flags & ~(SND_FAST_RXM | SND_TLP)) | SND_RTX;
+ /* §7.3: RTO supersedes TLP probes and ends the probe episode. */
+ frcti->tlp_high_seq = 0;
+ frcti->tlp_count = 0;
+
+ frcti->rtt_lwe = seqno + 1;
+
+ /* Only the HoL retransmit bumps the global RTO backoff. */
+ if (seqno == snd_lwe && frcti->rto_mul < MAX_RTO_MUL)
+ STORE_RELEASE(&frcti->rto_mul, frcti->rto_mul + 1);
+
+ /* RFC 8985 §7.2 step 4: RTO on HoL resets RACK reo scaling. */
+ if (seqno == snd_lwe)
+ frcti->reo_wnd_mult = 1;
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ STAT_BUMP(frcti, rxm_rto);
+
+ spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream);
+ if (spb == NULL)
+ return;
+
+ /* ETIMEDOUT/ENOMEM: let r-timer drive teardown. */
+ ret = frct_tx(frcti, spb);
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
}
-static void send_frct_pkt(struct frcti * frcti)
+static void rxm_due(void * arg)
+{
+ struct rxm_entry * r = arg;
+ struct frcti * frcti = r->frcti;
+ struct timespec now;
+ uint64_t now_ns;
+ uint32_t snd_lwe;
+ size_t pos = RQ_SLOT(r->seqno);
+
+ STAT_BUMP(frcti, rxm_due_count);
+
+ snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe);
+
+ /* Already ACK'd: expected for the steady-state majority. */
+ if (before(r->seqno, snd_lwe)) {
+ STAT_BUMP(frcti, rxm_due_acked);
+ goto cleanup;
+ }
+
+ /* SACK/RACK-cleared the slot (caller NULL'd snd_slots[pos].rxm). */
+ if (!rxm_still_owned(frcti, pos, r)) {
+ STAT_BUMP(frcti, rxm_due_unowned);
+ goto cleanup;
+ }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ /* R-timer expired: peer unreachable. */
+ if (RXM_AGED_OUT(r->t0, now_ns, frcti->t_r)) {
+ STAT_BUMP(frcti, rxm_due_aged);
+ frct_mark_flow_down(frcti);
+ goto cleanup;
+ }
+
+ /* HoL-only retx; defer at base rto so HoL transitions react. */
+ if (r->seqno != snd_lwe) {
+ STAT_BUMP(frcti, rxm_due_defer);
+ tw_post(&r->tw, now_ns + LOAD_RELAXED(&frcti->rto),
+ rxm_due, r);
+ return;
+ }
+
+ rxm_snd(frcti, r->seqno, r->pkt, r->len);
+
+ /* Re-check ownership: fire path may have replaced our entry. */
+ if (rxm_still_owned(frcti, pos, r)) {
+ uint64_t anchor;
+
+ /* Per-slot anchor breaks co-fire re-bin. */
+ anchor = frcti->snd_slots[pos].time;
+ tw_post(&r->tw, rxm_next_deadline(frcti, anchor), rxm_due, r);
+ return;
+ }
+
+ cleanup:
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (rxm_still_owned(frcti, pos, r))
+ STORE_RELEASE(&frcti->snd_slots[pos].rxm, NULL);
+
+ list_del(&r->next);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ rxm_entry_destroy(r);
+}
+
+static int rxm_arm(struct frcti * frcti,
+ uint32_t seqno,
+ const struct ssm_pk_buff * spb)
+{
+ struct rxm_entry * r;
+ time_t rto;
+ uint8_t rto_mul;
+ uint64_t deadline;
+
+ r = rxm_entry_create(frcti, seqno, spb);
+ if (r == NULL)
+ return -ENOMEM;
+
+ rto = LOAD_RELAXED(&frcti->rto);
+ rto_mul = LOAD_RELAXED(&frcti->rto_mul);
+ deadline = r->t0 + ((uint64_t) rto << rto_mul);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ list_add_tail(&r->next, &frcti->rxm_list);
+ STORE_RELEASE(&frcti->snd_slots[RQ_SLOT(seqno)].rxm, r);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ tw_post(&r->tw, deadline, rxm_due, r);
+
+ return 0;
+}
+
+static void rxm_cancel_all(struct frcti * frcti)
+{
+ struct list_head * p;
+ struct list_head * t;
+
+ list_for_each_safe(p, t, &frcti->rxm_list) {
+ struct rxm_entry * r = list_entry(p, struct rxm_entry, next);
+ list_del(&r->next);
+ tw_cancel(&r->tw);
+ rxm_entry_destroy(r);
+ STAT_BUMP(frcti, rxm_cancel);
+ }
+}
+
+static __inline__ void sack_block_put(uint8_t * payload,
+ uint16_t i,
+ uint32_t s,
+ uint32_t e)
+{
+ uint32_t * blk = (uint32_t *)
+ (payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE);
+
+ blk[0] = hton32(s);
+ blk[1] = hton32(e);
+}
+
+static __inline__ void sack_block_get(const uint8_t * payload,
+ uint16_t i,
+ uint32_t * s,
+ uint32_t * e)
+{
+ const uint32_t * blk = (const uint32_t *)
+ (payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE);
+
+ *s = ntoh32(blk[0]);
+ *e = ntoh32(blk[1]);
+}
+
+/*
+ * Build SACK blocks for ranges *above* rcv_cr.lwe. Wire invariant
+ * (see doc/frct.txt §1.3): every block produced here satisfies
+ * blocks[i].start > rcv_cr.lwe = ackno, which makes the "first block
+ * below ackno" convention used to mark a D-SACK (RFC 2883 §4 case 1)
+ * unambiguous. Caller holds frcti->lock.
+ */
+static uint16_t sack_blocks_build(struct frcti * frcti,
+ uint32_t blocks[][2],
+ uint16_t max_n)
+{
+ const struct rcv_slot * slots = frcti->rcv_slots;
+ uint32_t s;
+ uint32_t end;
+ uint16_t n = 0;
+
+ s = frcti->rcv_cr.lwe + 1;
+ end = frcti->rcv_cr.lwe + RQ_SIZE;
+ if (after(end, frcti->rcv_cr.rwe))
+ end = frcti->rcv_cr.rwe;
+
+ while (before(s, end) && n < max_n) {
+ while (before(s, end) && slots[RQ_SLOT(s)].idx == -1)
+ ++s;
+
+ if (!before(s, end))
+ break;
+
+ blocks[n][0] = s;
+ while (before(s, end) && slots[RQ_SLOT(s)].idx != -1)
+ ++s;
+ blocks[n][1] = s;
+ ++n;
+ }
+
+ return n;
+}
+
+/*
+ * Prepend the pending D-SACK report (if any) as block[0]; clear flag.
+ * Returns the number of slots consumed at the head (0 or 1). Caller
+ * holds wrlock.
+ */
+static __inline__ uint16_t dsack_consume(struct frcti * frcti,
+ uint32_t blocks[][2])
+{
+ if (!frcti->dsack_valid || frcti->sack_n_max == 0)
+ return 0;
+
+ blocks[0][0] = frcti->dsack_seqno;
+ blocks[0][1] = frcti->dsack_seqno + 1;
+ frcti->dsack_valid = false;
+ return 1;
+}
+
+/* Caller must NOT hold frcti->lock. */
+static void frcti_sack_snd(struct frcti * frcti,
+ const struct sack_args * sa)
+{
+ struct ssm_pk_buff * spb;
+ struct frct_pci * pci;
+ buffer_t buf;
+ uint16_t i;
+
+ assert(sa->n <= SACK_MAX_BLOCKS);
+
+ buf.len = SACK_HDR_SIZE + sa->n * SACK_BLOCK_SIZE;
+
+ if (frct_ctrl_alloc(&spb, &pci, buf.len) < 0)
+ return;
+
+ pci->flags = hton16(FRCT_ACK | FRCT_FC | FRCT_SACK);
+ pci->window = hton32(sa->rwe);
+ pci->ackno = hton32(sa->ack);
+ pci->seqno = hton32(FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1) + 1);
+
+ frct_hcs_set(pci, false);
+
+ buf.data = FRCT_BODY(pci);
+ memset(buf.data, 0, SACK_HDR_SIZE);
+ *(uint16_t *) buf.data = hton16(sa->n);
+ for (i = 0; i < sa->n; ++i)
+ sack_block_put(buf.data, i, sa->blocks[i][0], sa->blocks[i][1]);
+
+ frct_tx(frcti, spb);
+}
+
+static void ack_snd(struct frcti * frcti,
+ bool with_sack)
{
struct timespec now;
+ uint64_t now_ns;
time_t diff;
uint32_t ackno;
uint32_t rwe;
- int fd;
+ struct sack_args * sa = NULL;
+ size_t sa_sz;
+ bool sacking = false;
assert(frcti);
+ STAT_BUMP(frcti, ack_fire);
+
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ if (with_sack && frcti->sack_n_max > 0) {
+ sa_sz = sizeof(*sa) + frcti->sack_n_max * sizeof(sa->blocks[0]);
+ sa = malloc(sa_sz);
+ /* If alloc fails, fall through and send a bare cum-ACK. */
+ }
pthread_rwlock_wrlock(&frcti->lock);
- if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno)) {
+ /* D-SACK rides through cum-ACK freshness; signal is the duplicate. */
+ if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno)
+ && !frcti->dsack_valid) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_seqno);
+ goto out;
}
- fd = frcti->fd;
ackno = frcti->rcv_cr.lwe;
- rwe = frcti->rcv_cr.rwe;
+ rwe = frcti_advert_rwe(frcti);
- diff = ts_diff_ns(&now, &frcti->rcv_cr.act);
- if (diff > frcti->a) {
+ if (ACK_AGED_OUT(frcti->rcv_cr.act, now_ns, frcti->t_a)) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_inact);
+ goto out;
}
- diff = ts_diff_ns(&now, &frcti->snd_cr.act);
- if (diff < TICTIME) {
+ diff = (time_t) ts_age_ns(now_ns, frcti->snd_cr.act);
+ if (diff < TICTIME && !frcti->dsack_valid) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_rate);
+ goto out;
}
+ /* RFC 2018: piggyback SACK on timer ACK; dedup unchanged board. */
+ if (sa == NULL || (frcti->sack_n == 0 && !frcti->dsack_valid))
+ goto no_sack;
+
+ sa->dsack = false;
+ sa->n = dsack_consume(frcti, sa->blocks);
+ if (sa->n == 1)
+ sa->dsack = true;
+
+ sa->n += sack_blocks_build(frcti, sa->blocks + sa->n,
+ frcti->sack_n_max - sa->n);
+ if (sa->n == 0)
+ goto no_sack;
+
+ if (!sa->dsack && ackno == frcti->sack_lwe && sa->n == frcti->sack_n)
+ goto no_sack;
+
+ sa->ack = ackno;
+ sa->rwe = rwe;
+ frcti->sack_lwe = ackno;
+ frcti->sack_n = sa->n;
+ frcti->t_snd_sack = now_ns;
+ sacking = true;
+
+ no_sack:
frcti->rcv_cr.seqno = frcti->rcv_cr.lwe;
pthread_rwlock_unlock(&frcti->lock);
- __send_frct_pkt(fd, FRCT_ACK | FRCT_FC, ackno, rwe);
+ STAT_BUMP(frcti, ack_snd);
+
+ if (sacking) {
+ STAT_BUMP(frcti, sack_snd);
+ if (sa->dsack)
+ STAT_BUMP(frcti, dsack_snd);
+ frcti_sack_snd(frcti, sa);
+ } else {
+ frcti_pkt_snd(frcti, FRCT_ACK | FRCT_FC, ackno, rwe);
+ }
+
+ out:
+ free(sa);
}
-static void __send_rdv(int fd)
+/* Delayed-ACK timer: per-flow, dedup'd via atomic test-and-set. */
+static void ack_due(void * arg)
{
- __send_frct_pkt(fd, FRCT_RDVS, 0, 0);
+ struct frcti * frcti = arg;
+
+ __atomic_clear(&frcti->ack_pending, __ATOMIC_RELAXED);
+
+ ack_snd(frcti, true);
}
-static struct frcti * frcti_create(int fd,
- time_t a,
- time_t r,
- time_t mpl)
+static int ack_arm(struct frcti * frcti)
{
- struct frcti * frcti;
- ssize_t idx;
- struct timespec now;
- pthread_condattr_t cattr;
+ struct timespec now;
+ uint64_t deadline;
+
+ if (__atomic_test_and_set(&frcti->ack_pending, __ATOMIC_RELAXED))
+ return 0;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ deadline = TS_TO_UINT64(now) + ACK_DELAY_NS;
+
+ tw_post(&frcti->ack_tw, deadline, ack_due, frcti);
+
+ return 0;
+}
+
+/* Forward decl breaks the keepalive cycle: ka_arm <-> ka_due. */
+static void ka_due(void * arg);
+
+static int ka_arm(struct frcti * frcti)
+{
+ struct timespec now;
+ uint64_t now_ns;
+ uint64_t timeo_ns;
+ uint64_t snd_ns;
+ uint64_t rcv_ns;
+ uint64_t deadline;
+
+ timeo_ns = (uint64_t) frcti->qs_timeout * MILLION; /* IMM */
+ snd_ns = LOAD_RELAXED(&frcti->snd_cr.act) + timeo_ns / 4;
+ rcv_ns = LOAD_RELAXED(&frcti->rcv_cr.act) + timeo_ns;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+ deadline = MIN(snd_ns, rcv_ns);
+ if (deadline <= now_ns)
+ deadline = now_ns + timeo_ns / 4;
+
+ tw_post(&frcti->ka_tw, deadline, ka_due, frcti);
+
+ return 0;
+}
+
+__attribute__((cold))
+static void ka_snd(struct frcti * frcti)
+{
+ struct ssm_pk_buff * spb;
+ struct frct_pci * pci;
+ struct timespec now;
+ uint64_t now_ns;
+ time_t timeo_ns;
+ uint64_t rcv_act;
+ uint64_t ka_rcv;
+ int64_t rcv_idle;
+ int64_t snd_idle;
+ uint32_t ackno;
+
+ assert(frcti);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ timeo_ns = (time_t)(frcti->qs_timeout) * MILLION; /* IMM */
+ rcv_act = LOAD_RELAXED(&frcti->rcv_cr.act);
+ ka_rcv = LOAD_RELAXED(&frcti->t_ka_rcv);
+ rcv_idle = ts_age_ns(now_ns, rcv_act > ka_rcv ? rcv_act : ka_rcv);
+ snd_idle = ts_age_ns(now_ns, LOAD_RELAXED(&frcti->snd_cr.act));
+
+ if (rcv_idle > timeo_ns) {
+ frct_mark_peer_dead(frcti);
+ return;
+ }
+
+ if (snd_idle <= timeo_ns / 4) {
+ ka_arm(frcti);
+ return;
+ }
+
+ if (frct_ctrl_alloc(&spb, &pci, 0) < 0) {
+ ka_arm(frcti);
+ return;
+ }
+
+ ackno = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ pci->flags = hton16(FRCT_KA | FRCT_ACK);
+ pci->ackno = hton32(ackno);
+
+ frct_hcs_set(pci, false);
+
+ STAT_BUMP(frcti, ka_snd);
+ frct_tx(frcti, spb);
+
+ ka_arm(frcti);
+}
+
+/* Keepalive timer: re-posted by the fire callback itself. */
+static void ka_due(void * arg)
+{
+ ka_snd((struct frcti *) arg);
+}
+
+static void frcti_rdv_snd(struct frcti * frcti)
+{
+ frcti_pkt_snd(frcti, FRCT_RDVS, 0, 0);
+}
+
+#define HAS_RESCNTL(cr) ((cr)->cflags & FRCTFRESCNTL)
+static bool frcti_is_window_open(struct frcti * frcti)
+{
+ struct frct_cr * snd_cr = &frcti->snd_cr;
+ struct timespec now;
+ time_t diff;
+ bool ret = false;
+
+ if (!HAS_RESCNTL(snd_cr))
+ return true;
+
+ if (before(snd_cr->seqno, LOAD_RELAXED(&snd_cr->rwe)))
+ return true;
+
+ /* Window may be closed; wrlock for RDV state mutations. */
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (before(snd_cr->seqno, snd_cr->rwe)) {
+ ret = true;
+ goto unlock;
+ }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+ if (frcti->open) {
+ frcti->open = false;
+ frcti->t_wnd = now;
+ frcti->t_last_rdv = now;
+ goto unlock;
+ }
+
+ diff = ts_diff_ns(&now, &frcti->t_wnd);
+ if (diff > MAX_RDV)
+ goto unlock;
+
+ diff = ts_diff_ns(&now, &frcti->t_last_rdv);
+ if (diff > (time_t) frcti->t_rdv) {
+ frcti->t_last_rdv = now;
+ frcti_rdv_snd(frcti);
+ STAT_BUMP(frcti, rdv_snd);
+ }
+ unlock:
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return ret;
+}
+
+/* n contiguous seqnos free? No RDV: the n=1 path drives it. */
+static bool frcti_is_window_open_n(struct frcti * frcti,
+ size_t n)
+{
+ struct frct_cr * snd_cr = &frcti->snd_cr;
+
+ if (!HAS_RESCNTL(snd_cr))
+ return true;
+
+ if (n <= 1)
+ return frcti_is_window_open(frcti);
+
+ return before(snd_cr->seqno + (uint32_t)(n - 1),
+ LOAD_RELAXED(&snd_cr->rwe));
+}
+
+static void release_rq(struct frcti * frcti)
+{
+ size_t i;
+
+ for (i = 0; i < RQ_SIZE; ++i) {
+ if (frcti->rcv_slots[i].idx == -1)
+ continue;
+
+ /* Stream rq entries are sentinels (no spb owned). */
+ if (!frcti->stream)
+ frct_spb_release_idx(frcti->rcv_slots[i].idx);
+
+ frcti->rcv_slots[i].idx = -1;
+ STAT_BUMP(frcti, rq_released);
+ }
+}
+
+static __inline__ bool stream_ring_sz_ok(struct frcti * frcti,
+ size_t n)
+{
+ size_t per_pkt;
+
+ if (n > FRCT_STREAM_RING_SZ_MAX)
+ return false;
+
+ if ((n & (n - 1)) != 0)
+ return false;
+
+ per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti);
+
+ return n >= FRCT_STREAM_RING_MIN_PKTS * per_pkt;
+}
+
+/* Default ring sized for full RQ_SIZE seqno window; pow2, capped. */
+static size_t default_stream_ring_sz(size_t per_pkt)
+{
+ size_t need;
+ size_t sz;
+
+ need = (size_t) RQ_SIZE * per_pkt;
+ sz = FRCT_STREAM_RING_SZ;
+
+ while (sz < need && sz < FRCT_STREAM_RING_SZ_MAX)
+ sz <<= 1;
+
+ return sz;
+}
+
+struct frcti * frcti_create(int fd,
+ uint64_t a,
+ uint64_t r,
+ uint64_t mpl,
+ time_t rtt_hint,
+ qosspec_t qs,
+ uint32_t mtu)
+{
+ struct frcti * frcti;
+ ssize_t idx;
+ struct timespec now;
+ uint64_t now_ns;
+ size_t bb;
+ size_t per_pkt;
#ifdef PROC_FLOW_STATS
- char frctstr[FRCT_NAME_STRLEN + 1];
+ char frctstr[FRCT_NAME_STRLEN + 1];
#endif
- mpl *= MILLION;
- a *= BILLION;
- r *= BILLION;
+ mpl *= MILLION; /* ms -> ns */
+ a *= MILLION; /* ms -> ns */
+ r *= MILLION; /* ms -> ns */
frcti = malloc(sizeof(*frcti));
if (frcti == NULL)
@@ -349,56 +1828,76 @@ static struct frcti * frcti_create(int fd,
memset(frcti, 0, sizeof(*frcti));
+ list_head_init(&frcti->rxm_list);
+
if (pthread_rwlock_init(&frcti->lock, NULL))
goto fail_lock;
- if (pthread_mutex_init(&frcti->mtx, NULL))
- goto fail_mutex;
-
- if (pthread_condattr_init(&cattr))
- goto fail_cattr;
-#ifndef __APPLE__
- pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK);
-#endif
- if (pthread_cond_init(&frcti->cond, &cattr))
- goto fail_cond;
-
#ifdef PROC_FLOW_STATS
sprintf(frctstr, "%d", fd);
if (rib_reg(frctstr, &r_ops))
goto fail_rib_reg;
#endif
- pthread_condattr_destroy(&cattr);
for (idx = 0; idx < RQ_SIZE; ++idx)
- frcti->rq[idx] = -1;
+ frcti->rcv_slots[idx].idx = -1;
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ frcti->t_mpl = mpl;
+ frcti->t_a = a;
+ frcti->t_r = r;
+ frcti->t_rdv = DELT_RDV;
+ frcti->fd = fd;
+ frcti->ber = (time_t) qs.ber;
+ frcti->lossy = (qs.loss != 0);
+ frcti->qs_timeout = (time_t) qs.timeout;
+
+ frcti->frag_mtu = (size_t) mtu;
+
+ /* Cap blocks per SACK at what fits in the per-flow frag_mtu. */
+ bb = (frcti->frag_mtu - FRCT_PCILEN - SACK_HDR_SIZE)
+ / SACK_BLOCK_SIZE;
+ if (bb > SACK_MAX_BLOCKS)
+ bb = SACK_MAX_BLOCKS;
+ frcti->sack_n_max = (uint16_t) bb;
+
+ frcti->max_rcv_sdu = FRCT_MAX_SDU;
+
+ frcti->stream = (qs.service == SVC_STREAM);
+ if (frcti->stream) {
+ per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti);
+ frcti->rcv_ring_sz = default_stream_ring_sz(per_pkt);
+ frcti->ring_seq_cap =
+ (uint32_t) (frcti->rcv_ring_sz / per_pkt);
+ }
- frcti->mpl = mpl;
- frcti->a = a;
- frcti->r = r;
- frcti->rdv = DELT_RDV;
- frcti->fd = fd;
-
-
- frcti->rttseq = 0;
- frcti->probe = false;
-
- frcti->srtt = 0; /* Updated on first ACK */
- frcti->mdev = 10 * MILLION; /* Updated on first ACK */
- frcti->rto = BILLION; /* Initial rxm will be after 1 s */
-#ifdef PROC_FLOW_STATS
- frcti->n_rtx = 0;
- frcti->n_prb = 0;
- frcti->n_rtt = 0;
- frcti->n_dup = 0;
- frcti->n_dak = 0;
- frcti->n_rdv = 0;
- frcti->n_out = 0;
- frcti->n_rqo = 0;
-#endif
- if (proc.flows[fd].info.qs.loss == 0) {
+ frcti->rto_min = (time_t) MAX(RTO_MIN, 1ULL << RXMQ_RES);
+ rtt_init(frcti, rtt_hint);
+ frcti->t_min_rtt = now_ns;
+ frcti->probe_id_next = 1;
+ frcti->t_rcv_rtt = now_ns;
+ frcti->t_snd_probe = now_ns;
+ frcti->t_snd_sack = 0;
+ frcti->sack_lwe = 0;
+ frcti->sack_n = 0;
+ frcti->dsack_seqno = 0;
+ frcti->dsack_valid = false;
+ frcti->reo_wnd_mult = 1;
+ frcti->dsack_lwe_snap = 0;
+ frcti->t_last_reo_widen = 0;
+ /* So the first pre-DRF NACK fires without waiting cooldown. */
+ frcti->t_nack = now_ns - BILLION;
+ frcti->in_recovery = false;
+ frcti->recovery_high = 0;
+ frcti->rack_fired_lwe = 0;
+
+ tw_init_entry(&frcti->ack_tw);
+ tw_init_entry(&frcti->ka_tw);
+ tw_init_entry(&frcti->tlp_tw);
+
+ if (!frcti->lossy) {
frcti->snd_cr.cflags |= FRCTFRTX | FRCTFLINGER;
frcti->rcv_cr.cflags |= FRCTFRTX;
}
@@ -406,24 +1905,31 @@ static struct frcti * frcti_create(int fd,
frcti->snd_cr.cflags |= FRCTFRESCNTL;
frcti->snd_cr.rwe = START_WINDOW;
+ if (frcti->lossy)
+ frcti->snd_cr.rwe = RQ_SIZE;
- frcti->snd_cr.inact = (3 * mpl + a + r) / BILLION + 1; /* s */
- frcti->snd_cr.act.tv_sec = now.tv_sec - (frcti->snd_cr.inact + 1);
+ frcti->snd_cr.inact = 3 * mpl + a + r + BILLION; /* ns */
+ frcti->snd_cr.act = now_ns - frcti->snd_cr.inact - BILLION;
- frcti->rcv_cr.inact = (2 * mpl + a + r) / BILLION + 1; /* s */
- frcti->rcv_cr.act.tv_sec = now.tv_sec - (frcti->rcv_cr.inact + 1);
+ frcti->rcv_cr.inact = 2 * mpl + a + r + BILLION; /* ns */
+ frcti->rcv_cr.act = now_ns - frcti->rcv_cr.inact - BILLION;
+
+ frcti->t_ka_rcv = now_ns;
+
+ /* qs_timeout == 0: no KA, silent peer crash goes undetected. */
+ if (frcti->qs_timeout > 0) {
+ if (ka_arm(frcti) < 0)
+ goto fail_ka_arm;
+ }
return frcti;
+ fail_ka_arm:
#ifdef PROC_FLOW_STATS
+ sprintf(frctstr, "%d", fd);
+ rib_unreg(frctstr);
fail_rib_reg:
- pthread_cond_destroy(&frcti->cond);
#endif
- fail_cond:
- pthread_condattr_destroy(&cattr);
- fail_cattr:
- pthread_mutex_destroy(&frcti->mtx);
- fail_mutex:
pthread_rwlock_destroy(&frcti->lock);
fail_lock:
free(frcti);
@@ -431,21 +1937,55 @@ static struct frcti * frcti_create(int fd,
return NULL;
}
-static void frcti_destroy(struct frcti * frcti)
+void frcti_destroy(struct frcti * frcti)
{
#ifdef PROC_FLOW_STATS
char frctstr[FRCT_NAME_STRLEN + 1];
+#endif
+ /* Drop every wheel entry referencing frcti before freeing it. */
+ rxm_cancel_all(frcti);
+ tw_cancel(&frcti->ack_tw);
+ tw_cancel(&frcti->ka_tw);
+ tw_cancel(&frcti->tlp_tw);
+
+#if defined(PROC_FLOW_STATS) && defined(FRCT_DEBUG_STDOUT)
+ printf("[FRCT teardown] pid=%d fd=%d "
+ "sdu_snd=%zu sdu_reasm=%zu sdu_sole=%zu "
+ "frag_snd=%zu frag_rcv=%zu frag_drop=%zu "
+ "rxm_rto=%zu rxm_sack=%zu rxm_dup=%zu "
+ "rxm_due=%zu acked=%zu unowned=%zu aged=%zu defer=%zu "
+ "cancel=%zu arm_fail=%zu inflight=%u "
+ "nack_snd=%zu nack_rcv=%zu inact_drop=%zu "
+ "drf_rebase=%zu rq_released=%zu\n",
+ (int) getpid(), frcti->fd,
+ frcti->stat.sdu_snd_frag, frcti->stat.sdu_reasm,
+ frcti->stat.sdu_sole,
+ frcti->stat.frag_snd, frcti->stat.frag_rcv,
+ frcti->stat.frag_drop,
+ frcti->stat.rxm_rto, frcti->stat.rxm_sack,
+ frcti->stat.rxm_dupthresh,
+ frcti->stat.rxm_due_count, frcti->stat.rxm_due_acked,
+ frcti->stat.rxm_due_unowned, frcti->stat.rxm_due_aged,
+ frcti->stat.rxm_due_defer,
+ frcti->stat.rxm_cancel, frcti->stat.rxm_arm_fail,
+ frcti->snd_cr.seqno - frcti->snd_cr.lwe,
+ frcti->stat.nack_snd, frcti->stat.nack_rcv,
+ frcti->stat.inact_drop,
+ frcti->stat.drf_rebase, frcti->stat.rq_released);
+#endif
+
+ release_rq(frcti);
+ free(frcti->rcv_ring);
+#ifdef PROC_FLOW_STATS
sprintf(frctstr, "%d", frcti->fd);
rib_unreg(frctstr);
#endif
- pthread_cond_destroy(&frcti->cond);
- pthread_mutex_destroy(&frcti->mtx);
pthread_rwlock_destroy(&frcti->lock);
free(frcti);
}
-static uint16_t frcti_getflags(struct frcti * frcti)
+uint16_t frcti_getflags(struct frcti * frcti)
{
uint16_t ret;
@@ -453,89 +1993,91 @@ static uint16_t frcti_getflags(struct frcti * frcti)
pthread_rwlock_rdlock(&frcti->lock);
- ret = frcti->snd_cr.cflags;
+ ret = frcti->snd_cr.cflags & FRCTFMASK;
pthread_rwlock_unlock(&frcti->lock);
return ret;
}
-static void frcti_setflags(struct frcti * frcti,
- uint16_t flags)
+void frcti_setflags(struct frcti * frcti,
+ uint16_t flags)
{
- flags |= FRCTFRTX; /* Should not be set by command */
-
assert(frcti);
- pthread_rwlock_wrlock(&frcti->lock);
+ flags &= FRCTFSETMASK;
- frcti->snd_cr.cflags &= FRCTFRTX; /* Zero other flags */
+ pthread_rwlock_wrlock(&frcti->lock);
- frcti->snd_cr.cflags &= flags;
+ frcti->snd_cr.cflags = (frcti->snd_cr.cflags & ~FRCTFSETMASK) | flags;
pthread_rwlock_unlock(&frcti->lock);
}
-#define frcti_queued_pdu(frcti) \
- (frcti == NULL ? idx : __frcti_queued_pdu(frcti))
+size_t frcti_get_max_rcv_sdu(struct frcti * frcti)
+{
+ size_t ret;
-#define frcti_snd(frcti, spb) \
- (frcti == NULL ? 0 : __frcti_snd(frcti, spb))
+ assert(frcti);
-#define frcti_rcv(frcti, spb) \
- (frcti == NULL ? 0 : __frcti_rcv(frcti, spb))
+ pthread_rwlock_rdlock(&frcti->lock);
+ ret = frcti->max_rcv_sdu;
+ pthread_rwlock_unlock(&frcti->lock);
-#define frcti_dealloc(frcti) \
- (frcti == NULL ? 0 : __frcti_dealloc(frcti))
+ return ret;
+}
-#define frcti_is_window_open(frcti) \
- (frcti == NULL ? true : __frcti_is_window_open(frcti))
+int frcti_set_max_rcv_sdu(struct frcti * frcti,
+ size_t max)
+{
+ assert(frcti);
-#define frcti_window_wait(frcti, abstime) \
- (frcti == NULL ? 0 : __frcti_window_wait(frcti, abstime))
+ if (max == 0)
+ return -EINVAL;
+ pthread_rwlock_wrlock(&frcti->lock);
+ frcti->max_rcv_sdu = max;
+ pthread_rwlock_unlock(&frcti->lock);
-static bool __frcti_is_window_open(struct frcti * frcti)
+ return 0;
+}
+
+size_t frcti_get_rcv_ring_sz(struct frcti * frcti)
{
- struct frct_cr * snd_cr = &frcti->snd_cr;
- bool ret = true;
+ size_t ret;
+
+ assert(frcti);
pthread_rwlock_rdlock(&frcti->lock);
+ ret = frcti->rcv_ring_sz;
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return ret;
+}
- if (snd_cr->cflags & FRCTFRESCNTL)
- ret = before(snd_cr->seqno, snd_cr->rwe);
+/* Set before any stream byte has been delivered; -EBUSY otherwise. */
+int frcti_set_rcv_ring_sz(struct frcti * frcti,
+ size_t n)
+{
+ int ret = 0;
+ size_t per_pkt;
- if (!ret) {
- struct timespec now;
+ assert(frcti);
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ if (!frcti->stream)
+ return -ENOTSUP;
+ if (!stream_ring_sz_ok(frcti, n))
+ return -EINVAL;
- pthread_mutex_lock(&frcti->mtx);
- if (frcti->open) {
- frcti->open = false;
- frcti->t_wnd = now;
- frcti->t_rdvs = now;
- } else {
- time_t diff;
- diff = ts_diff_ns(&now, &frcti->t_wnd);
- if (diff > MAX_RDV) {
- pthread_mutex_unlock(&frcti->mtx);
- pthread_rwlock_unlock(&frcti->lock);
- return false;
- }
-
- diff = ts_diff_ns(&now, &frcti->t_rdvs);
- if (diff > frcti->rdv) {
- frcti->t_rdvs = now;
- __send_rdv(frcti->fd);
-#ifdef PROC_FLOW_STATS
- frcti->n_rdv++;
-#endif
+ per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti);
- }
- }
+ pthread_rwlock_wrlock(&frcti->lock);
- pthread_mutex_unlock(&frcti->mtx);
+ if (frcti->rcv_ring != NULL) {
+ ret = -EBUSY;
+ } else {
+ frcti->rcv_ring_sz = n;
+ frcti->ring_seq_cap = (uint32_t) (n / per_pkt);
}
pthread_rwlock_unlock(&frcti->lock);
@@ -543,392 +2085,2101 @@ static bool __frcti_is_window_open(struct frcti * frcti)
return ret;
}
-static int __frcti_window_wait(struct frcti * frcti,
- struct timespec * abstime)
+time_t frcti_get_rto_min(struct frcti * frcti)
{
- struct frct_cr * snd_cr = &frcti->snd_cr;
- int ret = 0;
+ time_t v;
+
+ assert(frcti);
pthread_rwlock_rdlock(&frcti->lock);
+ v = frcti->rto_min;
+ pthread_rwlock_unlock(&frcti->lock);
- if (!(snd_cr->cflags & FRCTFRESCNTL)) {
- pthread_rwlock_unlock(&frcti->lock);
+ return v;
+}
+
+/* Floor at the timer-wheel resolution; finer granularity is unrepresentable. */
+int frcti_set_rto_min(struct frcti * frcti,
+ time_t rto_min)
+{
+ time_t floor = (time_t) (1ULL << RXMQ_RES);
+ time_t rto_floor;
+ time_t rto;
+
+ assert(frcti);
+
+ if (rto_min < floor)
+ return -EINVAL;
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ frcti->rto_min = rto_min;
+ if (frcti->srtt > 0) {
+ rto_floor = MAX(rto_min, 2 * frcti->srtt);
+ rto = MAX(rto_floor,
+ frcti->srtt + (frcti->mdev << MDEV_MUL));
+ STORE_RELEASE(&frcti->rto, rto);
+ } else if (frcti->rto < rto_min) {
+ STORE_RELEASE(&frcti->rto, rto_min);
+ }
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return 0;
+}
+
+/* Re-arm a fresh rxm so a lost fast-retx still recovers via RTO. */
+static void sack_rxm_snd(struct frcti * frcti,
+ void * pkt,
+ size_t len)
+{
+ struct ssm_pk_buff * spb;
+ const struct frct_pci * pci;
+ uint32_t rcv_lwe;
+ uint32_t seqno;
+ int ret;
+
+ rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream);
+ if (spb == NULL)
+ return;
+
+ pci = (const struct frct_pci *) ssm_pk_buff_head(spb);
+ seqno = ntoh32(pci->seqno);
+
+ /* Register fresh rxm before send; old entry self-cleans. */
+ if (rxm_arm(frcti, seqno, spb) < 0) {
+ frct_spb_release(spb);
+ return;
+ }
+
+ STAT_BUMP(frcti, rxm_sack);
+ ret = frct_tx(frcti, spb);
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
+}
+
+/* Additive HoL emit; original snd_slots[hp].rxm stays armed (NewReno). */
+static int fast_rxm_send(struct frcti * frcti,
+ void * pkt,
+ size_t len)
+{
+ struct ssm_pk_buff * spb;
+ uint32_t rcv_lwe;
+
+ rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream);
+ if (spb == NULL)
return 0;
+
+ return frct_tx(frcti, spb);
+}
+
+/* PCI bytes survive head_release at receive; just rewind the pointer. */
+static __inline__ uint16_t frag_role_peek(struct ssm_pk_buff * spb)
+{
+ const struct frct_pci * pci;
+
+ assert(ssm_pk_buff_head(spb) != NULL);
+
+ pci = (const struct frct_pci *) (ssm_pk_buff_head(spb) - FRCT_PCILEN);
+
+ return ntoh16(pci->flags) & FRCT_FR_MASK;
+}
+
+enum frag_state {
+ FRAG_NOT_READY, /* head missing / FIRST..LAST run incomplete */
+ FRAG_DELIVER, /* *count fragments form a deliverable SDU */
+ FRAG_DROP, /* *count fragments at lwe are malformed */
+};
+
+/*
+ * On a gap in the run: FRTX waits (NOT_READY); best-effort scans forward
+ * for the next FIRST/SOLE and returns DROP for the broken prefix. *count
+ * gets the offset from the trailing edge. NOT_READY if no later run is
+ * in window. Caller rdlock.
+ */
+static enum frag_state frag_inspect_gap(struct frcti * frcti,
+ size_t start,
+ size_t * count)
+{
+ const struct rcv_slot * slots = frcti->rcv_slots;
+ struct ssm_pk_buff * spb;
+ uint32_t k;
+ uint16_t role;
+ size_t m;
+
+ if (frcti->rcv_cr.cflags & FRCTFRTX)
+ return FRAG_NOT_READY;
+
+ k = frcti->rcv_cr.rwe - RQ_SIZE;
+
+ for (m = start; m < RQ_SIZE; ++m) {
+ if (slots[RQ_SLOT(k + m)].idx == -1)
+ continue;
+
+ spb = rq_frag(frcti, k + m);
+ role = frag_role_peek(spb);
+
+ if (role == FRCT_FR_SOLE || role == FRCT_FR_FIRST) {
+ if (m == 0)
+ return FRAG_NOT_READY;
+
+ *count = m;
+ return FRAG_DROP;
+ }
}
- while (snd_cr->seqno == snd_cr->rwe && ret != -ETIMEDOUT) {
- struct timespec now;
- pthread_rwlock_unlock(&frcti->lock);
- pthread_mutex_lock(&frcti->mtx);
+ return FRAG_NOT_READY;
+}
+
+/*
+ * Inspect rq[lwe..]; set *count and return DELIVER/DROP/NOT_READY. DROP
+ * covers broken prefixes (mid/last at HoL, FIRST..[non-LAST]..new-FIRST).
+ * Non-FRTX flows skip past gaps to the next FIRST/SOLE. Caller rdlock.
+ */
+static enum frag_state frag_run_inspect(struct frcti * frcti,
+ size_t * count)
+{
+ const struct rcv_slot * slots = frcti->rcv_slots;
+ struct ssm_pk_buff * spb;
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+ uint16_t role;
+ size_t n = 0;
- if (frcti->open) {
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ if (slots[RQ_SLOT(k)].idx == -1)
+ return frag_inspect_gap(frcti, 0, count);
- frcti->t_wnd = now;
- frcti->t_rdvs = now;
- frcti->open = false;
+ spb = rq_frag(frcti, k);
+ role = frag_role_peek(spb);
+
+ if (role == FRCT_FR_SOLE) {
+ *count = 1;
+ return FRAG_DELIVER;
+ }
+
+ if (role != FRCT_FR_FIRST) {
+ *count = 1;
+ return FRAG_DROP;
+ }
+
+ while (true) {
+ if (n == RQ_SIZE || slots[RQ_SLOT(k + n)].idx == -1)
+ return frag_inspect_gap(frcti, n, count);
+
+ spb = rq_frag(frcti, k + n);
+ role = frag_role_peek(spb);
+ ++n;
+
+ if (role == FRCT_FR_LAST) {
+ *count = n;
+ return FRAG_DELIVER;
}
- pthread_cleanup_push(__cleanup_mutex_unlock, &frcti->mtx);
+ if (n > 1 && role != FRCT_FR_MID) {
+ /* SOLE or new FIRST mid-run: drop the prefix. */
+ *count = n - 1;
+ return FRAG_DROP;
+ }
+ }
+}
- ret = -__timedwait(&frcti->cond, &frcti->mtx, abstime);
+/* Caller wrlock. Delivery edge is implicit: rwe - RQ_SIZE. */
+static void frag_drop(struct frcti * frcti,
+ size_t count)
+{
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+ uint32_t edge;
+ size_t i;
- pthread_cleanup_pop(false);
+ for (i = 0; i < count; ++i) {
+ size_t pos = RQ_SLOT(k + i);
- if (ret == -ETIMEDOUT) {
- time_t diff;
+ if (frcti->rcv_slots[pos].idx == -1)
+ continue;
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ frct_spb_release_idx(frcti->rcv_slots[pos].idx);
+ frcti->rcv_slots[pos].idx = -1;
+ }
- diff = ts_diff_ns(&now, &frcti->t_wnd);
- if (diff > MAX_RDV) {
- pthread_mutex_unlock(&frcti->mtx);
- return -ECONNRESET; /* write fails! */
- }
+ frcti->rcv_cr.rwe += count;
- diff = ts_diff_ns(&now, &frcti->t_rdvs);
- if (diff > frcti->rdv) {
- frcti->t_rdvs = now;
- __send_rdv(frcti->fd);
- }
+ /* Drop may span a gap; pull lwe up to preserve rwe - RQ_SIZE <= lwe. */
+ edge = frcti->rcv_cr.rwe - RQ_SIZE;
+ if (before(frcti->rcv_cr.lwe, edge))
+ STORE_RELEASE(&frcti->rcv_cr.lwe, edge);
+}
+
+/* Copy `count` fragments at rq[lwe..] into buf; release + advance lwe. */
+static size_t frag_gather(struct frcti * frcti,
+ size_t count,
+ uint8_t * buf)
+{
+ struct ssm_pk_buff * frag;
+ size_t off = 0;
+ size_t i;
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+
+ for (i = 0; i < count; ++i) {
+ size_t pos = RQ_SLOT(k + i);
+ size_t flen;
+
+ frag = rq_frag(frcti, k + i);
+ flen = ssm_pk_buff_len(frag);
+ memcpy(buf + off, ssm_pk_buff_head(frag), flen);
+ off += flen;
+ frct_spb_release_idx(frcti->rcv_slots[pos].idx);
+ frcti->rcv_slots[pos].idx = -1;
+ }
+
+ frcti->rcv_cr.rwe += count;
+
+ return off;
+}
+
+/* Caller holds lock. */
+static size_t frag_total_len(struct frcti * frcti,
+ size_t count,
+ bool * overflow)
+{
+ struct ssm_pk_buff * frag;
+ size_t total = 0;
+ size_t i;
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+
+ *overflow = false;
+
+ for (i = 0; i < count; ++i) {
+ size_t flen;
+
+ frag = rq_frag(frcti, k + i);
+ flen = ssm_pk_buff_len(frag);
+ if (total + flen < total) {
+ *overflow = true;
+ return 0;
}
+ total += flen;
+ }
+
+ return total;
+}
+
+/*
+ * Process a delivered slot at lwe: latch FIN if acceptable,
+ * advance byte_high (clamped to byte_fin once latched).
+ */
+static __inline__ void stream_deliver_slot(struct frcti * frcti,
+ size_t lp)
+{
+ uint32_t end;
+
+ end = frcti->rcv_slots[lp].end;
+
+ if (frcti->rcv_slots[lp].fin) {
+ if (end == frcti->rcv_byte_high && !frcti->rcv_fin_seen) {
+ frcti->rcv_fin_seen = true;
+ frcti->rcv_byte_fin = end;
+ } else {
+ STAT_BUMP(frcti, strm_fin_drop);
+ }
+ }
+
+ if (frcti->rcv_fin_seen && after(end, frcti->rcv_byte_fin))
+ end = frcti->rcv_byte_fin;
+
+ frcti->rcv_byte_high = end;
+}
+
+/* Two-segment memcpy from buf into the rx ring at byte offset start. */
+static void stream_ring_write(struct frcti * frcti,
+ uint32_t start,
+ buffer_t buf)
+{
+ size_t mask = frcti->rcv_ring_sz - 1;
+ size_t off = start & mask;
+
+ if (off + buf.len <= frcti->rcv_ring_sz) {
+ memcpy(frcti->rcv_ring + off, buf.data, buf.len);
+ } else {
+ size_t first = frcti->rcv_ring_sz - off;
+ memcpy(frcti->rcv_ring + off, buf.data, first);
+ memcpy(frcti->rcv_ring, buf.data + first, buf.len - first);
+ }
+}
- pthread_mutex_unlock(&frcti->mtx);
- pthread_rwlock_rdlock(&frcti->lock);
+/* Two-segment memcpy from the rx ring at byte offset start into buf. */
+static void stream_ring_read(struct frcti * frcti,
+ uint32_t start,
+ buffer_t buf)
+{
+ size_t mask = frcti->rcv_ring_sz - 1;
+ size_t off = start & mask;
+
+ if (off + buf.len <= frcti->rcv_ring_sz) {
+ memcpy(buf.data, frcti->rcv_ring + off, buf.len);
+ } else {
+ size_t first = frcti->rcv_ring_sz - off;
+ memcpy(buf.data, frcti->rcv_ring + off, first);
+ memcpy(buf.data + first, frcti->rcv_ring, buf.len - first);
}
+}
+
+/* Deliver-or-drop one stashed slot at lwe; advance lwe/rwe. Caller wrlock. */
+static void stream_advance_lwe(struct frcti * frcti)
+{
+ size_t lp;
+
+ lp = RQ_SLOT(frcti->rcv_cr.lwe);
+
+ if (frcti->rcv_slots[lp].start != frcti->rcv_byte_high)
+ STAT_BUMP(frcti, strm_drop);
+ else
+ stream_deliver_slot(frcti, lp);
+
+ frcti->rcv_slots[lp].fin = 0;
+ frcti->rcv_slots[lp].idx = -1;
+ STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1);
+ frcti->rcv_cr.rwe++;
+}
+
+/*
+ * Validate a stream DATA packet before stashing. Returns 0 if the
+ * packet may be written into rcv_ring + rq[], -1 otherwise.
+ */
+static __inline__ int stream_stash_check(struct frcti * frcti,
+ uint32_t start,
+ uint32_t end,
+ size_t plen,
+ uint16_t flags)
+{
+ if (end - start != (uint32_t) plen)
+ return -1;
+ /* FIN MUST be 0-byte. */
+ if ((flags & FRCT_FIN) && plen != 0)
+ return -1;
+
+ /* Post-EOS: no further FIN once latched. */
+ if (frcti->rcv_fin_seen && (flags & FRCT_FIN))
+ return -1;
+
+ /* Post-EOS: reject data at or past byte_fin. */
+ if (frcti->rcv_fin_seen && !before(start, frcti->rcv_byte_fin))
+ return -1;
+
+ /* Stale: peer is behind the delivered edge. */
+ if (before(end, frcti->rcv_byte_next))
+ return -1;
+
+ /* Exact-edge: only an empty-stream FIN is meaningful. */
+ if (end == frcti->rcv_byte_next && !(flags & FRCT_FIN))
+ return -1;
+
+ if (end - frcti->rcv_byte_next > frcti->rcv_ring_sz)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Stream-mode DATA receive: validate, stash payload in rcv_ring, mark
+ * rq[pos], advance lwe through any newly-contiguous run. Returns 0
+ * (spb released) or -1 (caller releases). Caller wrlock.
+ */
+static int frcti_stream_data_rcv(struct frcti * frcti,
+ struct ssm_pk_buff * spb,
+ size_t pos,
+ uint16_t flags)
+{
+ struct frct_pci_stream * spci;
+ uint32_t start;
+ uint32_t end;
+ buffer_t buf;
+ size_t skip;
+
+ if (ssm_pk_buff_len(spb) < FRCT_PCI_STREAM_LEN)
+ return -1;
+
+ if (frcti->rcv_ring == NULL) {
+ frcti->rcv_ring = calloc(1, frcti->rcv_ring_sz);
+ if (frcti->rcv_ring == NULL)
+ return -ENOMEM;
+ }
+
+ spci = FRCT_HDR_POP(spb, frct_pci_stream);
+ start = ntoh32(spci->start);
+ end = ntoh32(spci->end);
+
+ buf.data = ssm_pk_buff_head(spb);
+ buf.len = ssm_pk_buff_len(spb);
+
+ if (stream_stash_check(frcti, start, end, buf.len, flags) < 0)
+ return -1;
+
+ /* Trim front-overlap with already-delivered region. */
+ if (before(start, frcti->rcv_byte_next)) {
+ skip = frcti->rcv_byte_next - start;
+ buf.data += skip;
+ buf.len -= skip;
+ start = frcti->rcv_byte_next;
+ }
+
+ stream_ring_write(frcti, start, buf);
+ STAT_ADD(frcti, strm_rcv_byte, buf.len);
+
+ frcti->rcv_slots[pos].idx = 1;
+ frcti->rcv_slots[pos].start = start;
+ frcti->rcv_slots[pos].end = end;
+ frcti->rcv_slots[pos].fin = (flags & FRCT_FIN) ? 1 : 0;
+
+ while (frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1)
+ stream_advance_lwe(frcti);
+
+ frct_spb_release(spb);
+
+ return 0;
+}
+
+/*
+ * DATA receive: stash idx at rq[pos], advance lwe through any
+ * contiguous run. Caller wrlock.
+ */
+static void frcti_data_stash(struct frcti * frcti,
+ ssize_t idx,
+ size_t pos,
+ uint16_t flags)
+{
+ frcti->rcv_slots[pos].idx = idx;
+
+ if ((flags & FRCT_FR_MASK) != FRCT_FR_SOLE)
+ STAT_BUMP(frcti, frag_rcv);
+
+ /* lwe = cum-ACK edge; advance per fragment through contiguous run. */
+ while (before(frcti->rcv_cr.lwe, frcti->rcv_cr.rwe)
+ && frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1)
+ STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1);
+}
+
+/* Stream consume: copy up to `count` contiguous bytes from ring into buf. */
+static ssize_t frcti_consume_stream(struct frcti * frcti,
+ uint8_t * buf,
+ size_t count)
+{
+ size_t avail;
+ size_t copy;
+ ssize_t ret;
+ buffer_t dst;
+
+ assert(frcti);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ avail = (size_t) (frcti->rcv_byte_high - frcti->rcv_byte_next);
+ if (avail == 0) {
+ /* EOS drained: signal EOF to the reader. */
+ if (frcti->rcv_fin_seen
+ && frcti->rcv_byte_next == frcti->rcv_byte_fin)
+ ret = 0;
+ else
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ copy = MIN(avail, count);
+
+ dst.data = buf;
+ dst.len = copy;
+ stream_ring_read(frcti, frcti->rcv_byte_next, dst);
+
+ frcti->rcv_byte_next += (uint32_t) copy;
+ STAT_ADD(frcti, strm_dlv_byte, copy);
+
+ ret = (ssize_t) copy;
+
+ unlock:
pthread_rwlock_unlock(&frcti->lock);
return ret;
}
-static ssize_t __frcti_queued_pdu(struct frcti * frcti)
+/*
+ * FRTX consume: copy next ready PDU (full SDU or nothing). Returns bytes,
+ * -EAGAIN (no PDU), or -EMSGSIZE (oversize: run dropped to unblock flow).
+ */
+static ssize_t frcti_consume(struct frcti * frcti,
+ uint8_t * buf,
+ size_t count)
{
- ssize_t idx;
- size_t pos;
+ size_t n;
+ size_t total;
+ bool overflow;
+ enum frag_state st;
+ ssize_t ret;
assert(frcti);
- /* See if we already have the next PDU. */
pthread_rwlock_wrlock(&frcti->lock);
- pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1);
-
- idx = frcti->rq[pos];
- if (idx != -1) {
- ++frcti->rcv_cr.lwe;
- ++frcti->rcv_cr.rwe;
- frcti->rq[pos] = -1;
+ while (true) {
+ st = frag_run_inspect(frcti, &n);
+ if (st == FRAG_NOT_READY) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+ if (st == FRAG_DROP) {
+ STAT_ADD(frcti, frag_drop, n);
+ frag_drop(frcti, n);
+ continue;
+ }
+ /* FRAG_DELIVER */
+ total = frag_total_len(frcti, n, &overflow);
+ if (overflow || total > frcti->max_rcv_sdu || total > count) {
+ STAT_ADD(frcti, frag_drop, n);
+ frag_drop(frcti, n);
+ ret = -EMSGSIZE;
+ goto unlock;
+ }
+ ret = (ssize_t) frag_gather(frcti, n, buf);
+ if (n > 1)
+ STAT_BUMP(frcti, sdu_reasm);
+ else
+ STAT_BUMP(frcti, sdu_sole);
+ goto unlock;
}
+ unlock:
pthread_rwlock_unlock(&frcti->lock);
- return idx;
+ return ret;
}
-static ssize_t __frcti_pdu_ready(struct frcti * frcti)
+static bool frcti_pdu_ready(struct frcti * frcti)
{
- ssize_t idx;
- size_t pos;
+ size_t pos;
+ size_t count;
+ bool ready;
assert(frcti);
- /* See if we already have the next PDU. */
pthread_rwlock_rdlock(&frcti->lock);
- pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1);
- idx = frcti->rq[pos];
+ if (frcti->stream) {
+ ready = frcti->rcv_byte_high != frcti->rcv_byte_next;
+ pthread_rwlock_unlock(&frcti->lock);
+ return ready;
+ }
+
+ if (frag_run_inspect(frcti, &count) != FRAG_DELIVER) {
+ /* Drop case: frcti_consume will handle it; not ready. */
+ pthread_rwlock_unlock(&frcti->lock);
+ return false;
+ }
+
+ pos = RQ_SLOT(frcti->rcv_cr.rwe - RQ_SIZE);
+ ready = frcti->rcv_slots[pos].idx != -1;
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return ready;
+}
+
+/* No srtt yet: probe at the cold-probe cadence to seed it. */
+#define PROBE_DUE_COLD(frcti, now_ns) \
+ ((now_ns) - (frcti)->t_snd_probe > (uint64_t) RTTP_COLD_NS)
+
+/* Have srtt: probe when peer quiet for > 2*srtt and last probe > srtt. */
+#define PROBE_DUE_WARM(frcti, now_ns) \
+ ((now_ns) - (frcti)->t_rcv_rtt > 2u * (uint64_t)(frcti)->srtt \
+ && (now_ns) - (frcti)->t_snd_probe > (uint64_t)(frcti)->srtt)
+
+/* Seeds srtt for receive-only sides so they don't fall back to 1 s RTO. */
+__attribute__((cold))
+static void frcti_rcv_probe(struct frcti * frcti,
+ uint64_t now_ns)
+{
+ uint32_t probe_id;
+ uint8_t nonce[RTTP_NONCE_LEN] = { 0 };
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (frcti->srtt == 0 && !PROBE_DUE_COLD(frcti, now_ns)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ if (frcti->srtt != 0 && !PROBE_DUE_WARM(frcti, now_ns)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ probe_id = rttp_alloc_probe(frcti, now_ns, nonce);
pthread_rwlock_unlock(&frcti->lock);
- return idx;
+ if (probe_id != 0)
+ frcti_rttp_snd(frcti, probe_id, 0, nonce);
}
-#include <timerwheel.c>
+/* Echo at slot `pos` matches our probe: id, slot, nonce all intact. */
+static __inline__ bool probe_echo_matches(struct frcti * frcti,
+ size_t pos,
+ uint32_t echo_id,
+ const uint8_t nonce[RTTP_NONCE_LEN])
+{
+ if (frcti->probes[pos].id != echo_id)
+ return false;
+
+ if (frcti->probes[pos].ts == 0)
+ return false;
+
+ return memcmp(frcti->probes[pos].nonce, nonce, RTTP_NONCE_LEN) == 0;
+}
/*
- * Send a final ACK for everything that has not been ACK'd.
- * If the flow should be kept active for retransmission,
- * the returned time will be negative.
+ * RTT probe (echo_id == 0): bounce the nonce back to peer.
+ * RTT echo (echo_id != 0): verify nonce + feed sample.
*/
-static time_t __frcti_dealloc(struct frcti * frcti)
+static void frcti_rttp_rcv(struct frcti * frcti,
+ buffer_t pkt,
+ uint64_t now_ns)
{
- struct timespec now;
- time_t wait;
- int ackno;
- int fd = -1;
+ const struct frct_rttp * rttp;
+ uint32_t probe_id;
+ uint32_t echo_id;
+ uint8_t nonce[RTTP_NONCE_LEN];
+ size_t ring_pos;
+ int64_t elapsed;
+ uint64_t sample;
+
+ if (pkt.len < RTTP_PAYLOAD)
+ return;
+
+ rttp = (const struct frct_rttp *) pkt.data;
+ probe_id = ntoh32(rttp->probe_id);
+ echo_id = ntoh32(rttp->echo_id);
+
+ /* Forged/malformed: bouncing this would loop on echo_id == 0. */
+ if (probe_id == 0 && echo_id == 0)
+ return;
+
+ memcpy(nonce, rttp->nonce, sizeof(nonce));
+
+ if (echo_id == 0) {
+ /* Probe: echo back with same nonce so peer can verify. */
+ STAT_BUMP(frcti, rttp_rcv);
+ frcti_rttp_snd(frcti, 0, probe_id, nonce);
+ return;
+ }
+
+ ring_pos = RTTP_POS(echo_id);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (!probe_echo_matches(frcti, ring_pos, echo_id, nonce)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ elapsed = ts_age_ns(now_ns, frcti->probes[ring_pos].ts);
+ frcti->probes[ring_pos].ts = 0;
+ frcti->t_rcv_rtt = now_ns;
+
+ if (elapsed <= 0) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+ sample = (uint64_t) elapsed;
+
+ /* Clamp probe sample to RTT_CLAMP_MUL * srtt to avoid poisoning. */
+ if (frcti->srtt > 0)
+ sample = MIN(sample, (uint64_t) frcti->srtt * RTT_CLAMP_MUL);
+
+ rtt_update(frcti, sample, now_ns);
+
+ pthread_rwlock_unlock(&frcti->lock);
+}
+
+/* Honours piggybacked ACK on the KA. */
+static void frcti_ka_rcv(struct frcti * frcti,
+ const struct frct_pci * pci,
+ uint64_t now_ns,
+ uint16_t flags)
+{
+ uint32_t ka_ackno;
+
+ STORE_RELEASE(&frcti->t_ka_rcv, now_ns);
+ STAT_BUMP(frcti, ka_rcv);
+
+ if (!(flags & FRCT_ACK))
+ return;
+
+ ka_ackno = ntoh32(pci->ackno);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (within(ka_ackno, frcti->snd_cr.lwe, frcti->snd_cr.seqno))
+ STORE_RELEASE(&frcti->snd_cr.lwe, ka_ackno);
+
+ pthread_rwlock_unlock(&frcti->lock);
+}
+
+/*
+ * Additive HoL re-emit (carries DRF); runs before rcv_cr->act
+ * refresh so it doesn't pre-empt peer's first DRF.
+ */
+__attribute__((cold))
+static void frcti_nack_rcv(struct frcti * frcti)
+{
+ struct timespec now;
+ uint64_t now_ns;
+ size_t hp;
+ struct rxm_entry * rxm;
+ void * pkt_copy = NULL;
+ size_t pkt_len = 0;
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ STAT_BUMP(frcti, nack_rcv);
+
+ if (frcti->snd_cr.seqno == frcti->snd_cr.lwe) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ hp = RQ_SLOT(frcti->snd_cr.lwe);
+ rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm);
+ if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ pkt_copy = malloc(rxm->len);
+ if (pkt_copy != NULL) {
+ memcpy(pkt_copy, rxm->pkt, rxm->len);
+ pkt_len = rxm->len;
+ /* Karn: suppress RTT sample. NACK supersedes pending TLP. */
+ frcti->snd_slots[hp].flags =
+ (frcti->snd_slots[hp].flags & ~SND_TLP)
+ | SND_RTX | SND_FAST_RXM;
+ frcti->rtt_lwe = frcti->snd_cr.lwe + 1;
+ STAT_BUMP(frcti, rxm_nack);
+ }
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ if (pkt_copy != NULL) {
+ int ret = fast_rxm_send(frcti, pkt_copy, pkt_len);
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
+ free(pkt_copy);
+ }
+}
+
+__attribute__((cold))
+static void frcti_rdv_rcv(struct frcti * frcti)
+{
+ uint32_t rwe;
pthread_rwlock_rdlock(&frcti->lock);
- ackno = frcti->rcv_cr.lwe;
- if (frcti->rcv_cr.lwe != frcti->rcv_cr.seqno)
- fd = frcti->fd;
+ rwe = frcti_advert_rwe(frcti);
- wait = MAX(frcti->rcv_cr.inact - now.tv_sec + frcti->rcv_cr.act.tv_sec,
- frcti->snd_cr.inact - now.tv_sec + frcti->snd_cr.act.tv_sec);
- wait = MAX(wait, 0);
+ pthread_rwlock_unlock(&frcti->lock);
+
+ STAT_BUMP(frcti, rdv_rcv);
- if (frcti->snd_cr.cflags & FRCTFLINGER
- && before(frcti->snd_cr.lwe, frcti->snd_cr.seqno))
- wait = -wait;
+ frcti_pkt_snd(frcti, FRCT_FC, 0, rwe);
+}
+
+/* §7.2: PTO = 2*SRTT + max delayed-ACK delay; fallback when unseeded. */
+static __inline__ uint64_t tlp_pto(const struct frcti * frcti)
+{
+ if (frcti->srtt > 0)
+ return 2ULL * (uint64_t) frcti->srtt + ACK_DELAY_NS;
+ return NACK_COOLDOWN_NS;
+}
+
+/*
+ * RFC 8985 §7: lazy probe. Re-evaluate on fire — if sender was active
+ * within PTO, re-post; else probe HoL once and hand off to RTO.
+ */
+__attribute__((cold))
+static void tlp_due(void * arg)
+{
+ struct frcti * frcti = arg;
+ struct timespec now;
+ uint64_t now_ns;
+ uint64_t pto;
+ uint64_t rto_at;
+ size_t hp;
+ struct rxm_entry * rxm;
+ void * pkt_copy = NULL;
+ size_t pkt_len = 0;
+ bool re_post = false;
+ uint64_t deadline = 0;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (frcti->snd_cr.seqno == frcti->snd_cr.lwe)
+ goto unlock;
+
+ if (!before(frcti->snd_cr.seqno, frcti->snd_cr.rwe))
+ goto unlock; /* FC-blocked: RDV handles it. */
+
+ /* RFC 8985 §7.3: one outstanding probe, MAX_TLP_PER_EP per ep. */
+ if (frcti->tlp_high_seq != 0)
+ goto unlock;
+
+ if (frcti->tlp_count >= MAX_TLP_PER_EP)
+ goto unlock;
+
+ pto = tlp_pto(frcti);
+
+ /* §7.2: anchor PTO on most recent send; defer if still active. */
+ if (now_ns < frcti->snd_cr.act + pto) {
+ deadline = frcti->snd_cr.act + pto;
+ re_post = true;
+ goto unlock;
+ }
+
+ hp = RQ_SLOT(frcti->snd_cr.lwe);
+ rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm);
+ if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r))
+ goto unlock;
+
+ /* Cap: if HoL RTO is due, let rxm_due fire instead. */
+ rto_at = rxm->t0 + ((uint64_t) frcti->rto
+ << LOAD_RELAXED(&frcti->rto_mul));
+ if (rto_at <= now_ns)
+ goto unlock;
+
+ pkt_copy = malloc(rxm->len);
+ if (pkt_copy != NULL) {
+ memcpy(pkt_copy, rxm->pkt, rxm->len);
+ pkt_len = rxm->len;
+ frcti->snd_slots[hp].time = now_ns;
+ frcti->snd_slots[hp].flags |= SND_TLP | SND_FAST_RXM;
+ frcti->rtt_lwe = frcti->snd_cr.lwe + 1;
+ /* §7.3 outstanding-probe marker; ack_rcv/rxm_snd clear. */
+ frcti->tlp_high_seq = frcti->snd_cr.seqno;
+ frcti->tlp_count++;
+ STAT_BUMP(frcti, tlp_snd);
+ }
+
+ unlock:
pthread_rwlock_unlock(&frcti->lock);
- if (fd != -1)
- __send_frct_pkt(fd, FRCT_ACK, ackno, 0);
+ if (pkt_copy != NULL) {
+ fast_rxm_send(frcti, pkt_copy, pkt_len);
+ free(pkt_copy);
+ }
+
+ if (re_post)
+ tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti);
+ else
+ __atomic_clear(&frcti->tlp_pending, __ATOMIC_RELAXED);
+}
+
+/* §7.2 lazy: post once per quiet period. tlp_due re-evaluates on fire. */
+static int tlp_arm(struct frcti * frcti)
+{
+ struct timespec now;
+ uint64_t now_ns;
+ uint64_t pto;
+ uint64_t deadline;
+
+ /* §7.3: one outstanding probe, MAX_TLP_PER_EP per recovery ep. */
+ if (LOAD_RELAXED(&frcti->tlp_high_seq) != 0)
+ return 0;
+ if (LOAD_RELAXED(&frcti->tlp_count) >= MAX_TLP_PER_EP)
+ return 0;
+ if (__atomic_test_and_set(&frcti->tlp_pending, __ATOMIC_RELAXED))
+ return 0;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pto = tlp_pto(frcti);
+
+ deadline = LOAD_RELAXED(&frcti->snd_cr.act) + pto;
+ if (deadline <= now_ns)
+ deadline = now_ns + pto;
+
+ tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti);
+
+ return 0;
+}
+
+/*
+ * FC window advert from any flag-bearing packet. Caps at lwe + RQ_SIZE,
+ * rejects backward shrink (forged/stale FC), marks window open.
+ * Caller wrlock.
+ */
+static __inline__ void frcti_fc_rcv(struct frcti * frcti,
+ const struct frct_pci * pci)
+{
+ struct frct_cr * snd_cr;
+ uint32_t rwe;
+ uint32_t rwe_max;
+
+ snd_cr = &frcti->snd_cr;
+ rwe = ntoh32(pci->window);
+ rwe_max = snd_cr->lwe + RQ_SIZE;
+
+ if (after(rwe, rwe_max))
+ rwe = rwe_max;
+
+ /* Reject backward shrink (forged/stale FC). */
+ if (before(rwe, snd_cr->rwe))
+ rwe = snd_cr->rwe;
+
+ STORE_RELAXED(&snd_cr->rwe, rwe);
+ frcti->open = true;
+}
+
+/* Packet copies captured under frcti->lock; emitted after release. */
+struct pending {
+ buffer_t fast_rxm;
+ buffer_t sack_rxm[SACK_RXM_MAX];
+ size_t sack_rxm_cnt;
+};
+
+/* RFC 6582 §3.2: seal recovery_high on entry; do not extend on new gaps. */
+static void recovery_enter(struct frcti * frcti)
+{
+ if (frcti->in_recovery)
+ return;
+
+ frcti->in_recovery = true;
+ frcti->recovery_high = frcti->snd_cr.seqno + RTT_QUARANTINE;
+}
+
+/* True when cum-ACK clears recovery_high or all in-flight ACKed. */
+static bool recovery_exit_reached(struct frcti * frcti,
+ uint32_t ackno)
+{
+ if (!frcti->in_recovery)
+ return false;
+
+ if (!before(ackno, frcti->recovery_high))
+ return true;
+
+ return ackno == frcti->snd_cr.seqno;
+}
+
+/* RTT sample gate: Karn + SACK-consume + don't-seed. */
+static bool rtt_sample_eligible(struct frcti * frcti,
+ size_t p,
+ uint16_t flags,
+ uint32_t lwe)
+{
+ if (flags & FRCT_RXM)
+ return false;
+ if (frcti->snd_slots[p].flags & (SND_RTX | SND_TLP))
+ return false;
+ if (LOAD_ACQUIRE(&frcti->snd_slots[p].rxm) == NULL)
+ return false;
+ if (before(lwe, frcti->rtt_lwe))
+ return false;
+ /* Don't seed srtt from a cum-ACK; let probes seed. */
+ if (frcti->srtt == 0)
+ return false;
+ return true;
+}
+
+#define RXM_SLOT_EMPTY(rxm) ((rxm) == NULL)
+#define FAST_RXM_STAGED(pending) ((pending)->fast_rxm.data != NULL)
+#define RXM_FAST_DONE(flags) (((flags) & SND_FAST_RXM) != 0)
+
+/* RACK fast retransmit on cum-ACK: HoL aged past R, not yet retransmitted. */
+static void fast_rxm_consider(struct frcti * frcti,
+ uint64_t now_ns,
+ struct pending * pending)
+{
+ struct rxm_entry * rxm;
+ struct snd_slot * slot;
+ size_t hp;
+ uint64_t R;
+ bool rack_ok;
+
+ hp = RQ_SLOT(frcti->snd_cr.lwe);
+ slot = &frcti->snd_slots[hp];
+ rxm = LOAD_ACQUIRE(&slot->rxm);
+ R = rack_reorder_window(frcti);
+
+ if (RXM_SLOT_EMPTY(rxm))
+ return;
+
+ /* RFC 8985 §6.2: time-based RACK OR DupThresh count. */
+ rack_ok = (int64_t)(frcti->t_latest_ack - slot->time) > (int64_t) R;
+ if (!rack_ok && frcti->dup_thresh < DUP_THRESH)
+ return;
+
+ /* HoL aged past t_r; let rxm_due tear the flow down. */
+ if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r))
+ return;
+
+ /* Already on it. */
+ if (FAST_RXM_STAGED(pending) || RXM_FAST_DONE(slot->flags))
+ return;
+
+ recovery_enter(frcti);
+
+ pending->fast_rxm.data = malloc(rxm->len);
+ if (pending->fast_rxm.data == NULL)
+ return;
+
+ pending->fast_rxm.len = rxm->len;
+ memcpy(pending->fast_rxm.data, rxm->pkt, rxm->len);
+ slot->flags |= SND_RTX | SND_FAST_RXM;
+ frcti->rtt_lwe = frcti->snd_cr.lwe + 1;
+ if (rack_ok)
+ STAT_BUMP(frcti, rxm_rack);
+ else
+ STAT_BUMP(frcti, rxm_dupthresh);
+}
+
+/* Caller holds wrlock; RACK fast retransmit queued in pending. */
+__attribute__((hot))
+static void frcti_ack_rcv(struct frcti * frcti,
+ const struct frct_pci * pci,
+ uint16_t flags,
+ uint64_t now_ns,
+ struct pending * pending)
+{
+ uint32_t ackno;
+ uint32_t lwe;
+ size_t p;
+ size_t fresh;
+
+ if (!(flags & FRCT_DATA))
+ STAT_BUMP(frcti, ack_rcv);
+
+ ackno = ntoh32(pci->ackno);
+ if (ackno == frcti->snd_cr.lwe) {
+ /* RFC 8985 §6.2: only on scoreboard change. */
+ if (frcti->snd_cr.lwe != frcti->rack_fired_lwe) {
+ fast_rxm_consider(frcti, now_ns, pending);
+ frcti->rack_fired_lwe = frcti->snd_cr.lwe;
+ }
+ return;
+ }
+
+ if (!within(ackno, frcti->snd_cr.lwe, frcti->snd_cr.seqno))
+ return;
+
+ lwe = frcti->snd_cr.lwe;
+ p = RQ_SLOT(lwe);
+
+ STORE_RELEASE(&frcti->snd_cr.lwe, ackno);
+
+ /* §7.3: cum-ACK past the probed seqno resolves the TLP. */
+ if (frcti->tlp_high_seq != 0
+ && !before(ackno, frcti->tlp_high_seq))
+ frcti->tlp_high_seq = 0;
+
+ /* §7.3: end the probe episode once inflight drains. */
+ if (ackno == frcti->snd_cr.seqno)
+ frcti->tlp_count = 0;
+
+ /* RFC 8985 §7.2: halve mult per REO_DECAY_PKTS fresh-ACK'd seqnos. */
+ fresh = ackno - frcti->dsack_lwe_snap;
+ if (frcti->reo_wnd_mult > 1 && fresh >= REO_DECAY_PKTS) {
+ uint8_t half = frcti->reo_wnd_mult >> 1;
+ frcti->reo_wnd_mult = half < 1 ? 1 : half;
+ frcti->dsack_lwe_snap = ackno;
+ }
+
+ /* RFC 8985: latest cum-ACKed send-time (slot of ackno-1). */
+ frcti->t_latest_ack = frcti->snd_slots[RQ_SLOT(ackno - 1)].time;
+
+ /* RFC 8985: SACK-above-lwe count is per-recovery-episode. */
+ frcti->dup_thresh = 0;
+
+ /* Karn-skip on retx; TLP ACK clears rto_mul (no CC backoff). */
+ if ((frcti->snd_slots[p].flags & SND_RTX) == 0
+ || (frcti->snd_slots[p].flags & SND_TLP) != 0)
+ STORE_RELEASE(&frcti->rto_mul, 0);
- return wait;
+ if (recovery_exit_reached(frcti, ackno))
+ frcti->in_recovery = false;
+
+ if (rtt_sample_eligible(frcti, p, flags, lwe)) {
+ int64_t mrtt = ts_age_ns(now_ns, frcti->snd_slots[p].time);
+ if (mrtt > 0) {
+ if (!(flags & FRCT_DATA))
+ STAT_BUMP(frcti, ack_rtt);
+ rtt_update(frcti, (time_t) mrtt, now_ns);
+ frcti->t_rcv_rtt = now_ns;
+ }
+ }
}
-static int __frcti_snd(struct frcti * frcti,
- struct ssm_pk_buff * spb)
+/* Skip k == lwe under clamp: NULLing HoL from a stale SACK wedges it. */
+static uint32_t sack_mark_blocks(struct frcti * frcti,
+ const uint8_t * payload,
+ uint16_t n,
+ uint32_t * newly_marked)
{
- struct frct_pci * pci;
- struct timespec now;
- struct frct_cr * snd_cr;
- struct frct_cr * rcv_cr;
- uint32_t seqno;
- bool rtx;
+ uint32_t hi_sacked = frcti->snd_cr.lwe;
+ uint32_t marked = 0;
+ uint16_t i;
+
+ for (i = 0; i < n; ++i) {
+ uint32_t s;
+ uint32_t e;
+ uint32_t k;
+ bool clamped;
+
+ sack_block_get(payload, i, &s, &e);
+
+ if (!before(s, e))
+ continue;
+
+ clamped = before(s, frcti->snd_cr.lwe);
+ if (clamped)
+ s = frcti->snd_cr.lwe;
+ if (after(e, frcti->snd_cr.seqno))
+ e = frcti->snd_cr.seqno;
+
+ for (k = s; before(k, e); ++k) {
+ size_t kp = RQ_SLOT(k);
+ uint64_t t_k;
+ if (clamped && k == frcti->snd_cr.lwe)
+ continue;
+ if (LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm) == NULL)
+ continue;
+ STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL);
+ frcti->snd_slots[kp].flags = 0;
+ marked++;
+ /* RACK.fack: latest SACK-confirmed send-time. */
+ t_k = frcti->snd_slots[kp].time;
+ if (t_k > frcti->t_latest_ack)
+ frcti->t_latest_ack = t_k;
+ }
+
+ if (after(e, hi_sacked))
+ hi_sacked = e;
+ }
+
+ *newly_marked = marked;
+ return hi_sacked;
+}
+
+/* Queue once per loss event (SND_FAST_RXM gates). Emit after unlock. */
+static void sack_queue_rxm(struct frcti * frcti,
+ uint32_t hi_sacked,
+ uint64_t now_ns,
+ struct pending * pending)
+{
+ uint64_t R = rack_reorder_window(frcti);
+ uint32_t k;
+ bool rack_ok;
+
+ for (k = frcti->snd_cr.lwe; before(k, hi_sacked); ++k) {
+ struct rxm_entry * rxm;
+ size_t kp = RQ_SLOT(k);
+ size_t cnt = pending->sack_rxm_cnt;
+ size_t rack_age;
+
+ rxm = LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm);
+
+ if (cnt >= SACK_RXM_MAX)
+ break;
+
+ if (rxm == NULL)
+ continue;
+
+ if (frcti->snd_slots[kp].flags & SND_FAST_RXM)
+ continue;
+
+ if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r))
+ continue;
+
+ rack_age = frcti->t_latest_ack - frcti->snd_slots[kp].time;
+ /* RFC 8985 §6.2: time-based RACK OR DupThresh count. */
+ rack_ok = (int64_t) rack_age > (int64_t) R;
+ if (!rack_ok && frcti->dup_thresh < DUP_THRESH)
+ continue;
+
+ if (rack_ok)
+ STAT_BUMP(frcti, rxm_rack);
+ else
+ STAT_BUMP(frcti, rxm_dupthresh);
+
+ pending->sack_rxm[cnt].data = malloc(rxm->len);
+ if (pending->sack_rxm[cnt].data == NULL)
+ break;
+
+ pending->sack_rxm[cnt].len = rxm->len;
+ memcpy(pending->sack_rxm[cnt].data, rxm->pkt, rxm->len);
+ pending->sack_rxm_cnt++;
+ /* NULL slot so the original timer self-cleans. */
+ STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL);
+ frcti->snd_slots[kp].time = now_ns;
+ frcti->snd_slots[kp].flags |= SND_RTX | SND_FAST_RXM;
+ frcti->rtt_lwe = k + 1;
+ }
+}
+
+/*
+ * RFC 2883 D-SACK detector. Returns true iff block[0] is a D-SACK
+ * report:
+ * case 1: blocks[0].start < pkt_ackno (strictly below cum-ACK).
+ * case 2: blocks[0] is a strict sub-range of some blocks[i>0].
+ * MAX_DSACK_LAG bounds case-1 distance to one rcv window (sanity).
+ */
+static bool sack_is_dsack(struct frcti * frcti,
+ const uint8_t * payload,
+ uint16_t n,
+ uint32_t pkt_ackno)
+{
+ uint32_t s0;
+ uint32_t e0;
+ uint16_t i;
+
+ if (n == 0)
+ return false;
+
+ sack_block_get(payload, 0, &s0, &e0);
+ if (!before(s0, e0))
+ return false;
+
+ if (before(s0, pkt_ackno)) {
+ if ((pkt_ackno - s0) <= (uint32_t) MAX_DSACK_LAG)
+ return true;
+ STAT_BUMP(frcti, dsack_drop);
+ return false;
+ }
+
+ for (i = 1; i < n; ++i) {
+ uint32_t si;
+ uint32_t ei;
+
+ sack_block_get(payload, i, &si, &ei);
+ if (!before(si, ei))
+ continue;
+ if (!before(s0, si) && !after(e0, ei)
+ && (s0 != si || e0 != ei))
+ return true;
+ }
+
+ return false;
+}
+
+/* RFC 8985 §7.2: grow reo_wnd_mult on DSACK; at most once per RTT. */
+static __inline__ void reo_wnd_on_dsack(struct frcti * frcti,
+ uint64_t now_ns)
+{
+ time_t srtt = frcti->srtt;
+
+ /* Snap is unconditional: feeds the per-D-SACK decay clock. */
+ frcti->dsack_lwe_snap = frcti->snd_cr.lwe;
+
+ if (srtt > 0
+ && now_ns - frcti->t_last_reo_widen <= (uint64_t) srtt)
+ return;
+
+ if (frcti->reo_wnd_mult < REO_WND_MULT_MAX)
+ frcti->reo_wnd_mult++;
+
+ frcti->t_last_reo_widen = now_ns;
+}
+
+/* Caller holds wrlock; retransmits queued for post-unlock emission. */
+static void frcti_sack_rcv(struct frcti * frcti,
+ buffer_t pkt,
+ uint32_t pkt_ackno,
+ uint64_t now_ns,
+ struct pending * pending)
+{
+ uint32_t hi_sacked;
+ uint32_t marked;
+ uint16_t n;
+ bool dsack;
+ uint16_t n_real;
+
+ if (pkt.len < SACK_HDR_SIZE)
+ return;
+
+ n = ntoh16(*(const uint16_t *) pkt.data);
+ if (n > SACK_MAX_BLOCKS)
+ return;
+
+ if (pkt.len < SACK_HDR_SIZE + (size_t) n * SACK_BLOCK_SIZE)
+ return;
+
+ STAT_BUMP(frcti, sack_rcv);
+
+ dsack = sack_is_dsack(frcti, pkt.data, n, pkt_ackno);
+ n_real = n - (dsack ? 1 : 0);
+
+ if (dsack) {
+ STAT_BUMP(frcti, dsack_rcv);
+ reo_wnd_on_dsack(frcti, now_ns);
+ }
+
+ /* DSACK-only carries no new gap; don't enter recovery. */
+ if (n_real > 0)
+ recovery_enter(frcti);
+
+ marked = 0;
+ hi_sacked = sack_mark_blocks(frcti, pkt.data, n, &marked);
+ frcti->dup_thresh += marked;
+
+ if (after(hi_sacked, frcti->snd_cr.lwe))
+ sack_queue_rxm(frcti, hi_sacked, now_ns, pending);
+}
+
+/* Emit and free queued packet copies. */
+static void pending_flush(struct frcti * frcti,
+ struct pending * pending)
+{
+ size_t i;
+
+ for (i = 0; i < pending->sack_rxm_cnt; ++i) {
+ sack_rxm_snd(frcti, pending->sack_rxm[i].data,
+ pending->sack_rxm[i].len);
+ free(pending->sack_rxm[i].data);
+ }
+
+ if (pending->fast_rxm.data != NULL) {
+ int ret = fast_rxm_send(frcti, pending->fast_rxm.data,
+ pending->fast_rxm.len);
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
+ free(pending->fast_rxm.data);
+ }
+}
+
+/* Pre-DRF NACK: ask peer to retransmit HoL; seqno is informational. */
+static void frcti_nack_snd(struct frcti * frcti,
+ uint32_t seqno_unseen)
+{
+ struct ssm_pk_buff * spb;
+ struct frct_pci * pci;
+
+ if (frct_ctrl_alloc(&spb, &pci, 0) < 0)
+ return;
+
+ pci->flags = hton16(FRCT_NACK);
+ pci->seqno = hton32(seqno_unseen);
+
+ frct_hcs_set(pci, false);
+
+ frct_tx(frcti, spb);
+}
+
+enum frct_act {
+ FRCT_ACTIVE,
+ FRCT_INACT_NEED_NACK,
+ FRCT_INACT_DROP,
+};
+
+/* On rcv inactivity: rebase on DRF, or arm pre-DRF NACK. Caller wrlock. */
+static enum frct_act rcv_inact_check(struct frcti * frcti,
+ uint16_t flags,
+ uint32_t seqno,
+ uint64_t now_ns)
+{
+ struct frct_cr * rcv_cr = &frcti->rcv_cr;
+ uint64_t cd;
+
+ if (!ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact))
+ return FRCT_ACTIVE;
+
+ if (flags & FRCT_DRF) {
+ if (same_epoch_drf(seqno, flags, rcv_cr))
+ return FRCT_ACTIVE;
+
+ /* Bootstrap or fresh epoch: rebase. */
+ STAT_BUMP(frcti, drf_rebase);
+ release_rq(frcti);
+ STORE_RELEASE(&rcv_cr->lwe, seqno);
+ rcv_cr->rwe = seqno + RQ_SIZE;
+ rcv_cr->seqno = seqno;
+ return FRCT_ACTIVE;
+ }
+
+ if (!(flags & FRCT_DATA))
+ return FRCT_ACTIVE;
+
+ /* Pre-DRF: nudge sender with NACK (rate-limited). */
+ cd = frcti->srtt > 0 ? (uint64_t) frcti->srtt : NACK_COOLDOWN_NS;
+ if (!ts_aged_ns(now_ns, frcti->t_nack, cd))
+ return FRCT_INACT_DROP;
+
+ frcti->t_nack = now_ns;
+ STAT_BUMP(frcti, nack_snd);
+
+ return FRCT_INACT_NEED_NACK;
+}
+
+/* Both modes: bounded accept into rq[seqno]. Caller wrlock. */
+__attribute__((hot))
+static bool rq_accept(struct frcti * frcti,
+ uint32_t seqno,
+ size_t pos,
+ uint16_t flags)
+{
+ struct frct_cr * rcv_cr = &frcti->rcv_cr;
+
+ if (!before(seqno, rcv_cr->rwe)) {
+ STAT_BUMP(frcti, out_rcv);
+ return false;
+ }
+
+ if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) {
+ STAT_BUMP(frcti, rqo_rcv);
+ return false;
+ }
+
+ if (frcti->rcv_slots[pos].idx != -1) {
+ if (flags & FRCT_RXM)
+ STAT_BUMP(frcti, rxm_dup_rcv);
+ else
+ STAT_BUMP(frcti, dup_rcv);
+ /* RFC 2883 §4 case 2: in-window dup; sub-range marker. */
+ frcti->dsack_seqno = seqno;
+ frcti->dsack_valid = true;
+ return false;
+ }
+
+ return true;
+}
+
+/* OOO arrival; throttle by min_gap + scoreboard dedup. */
+static bool sack_check(struct frcti * frcti,
+ uint32_t seqno,
+ uint64_t now_ns,
+ struct sack_args * out)
+{
+ struct frct_cr * rcv_cr = &frcti->rcv_cr;
+ uint64_t min_gap;
+ uint16_t n;
+
+ if (!after(seqno, rcv_cr->lwe))
+ return false;
+
+ STAT_BUMP(frcti, ooo_rcv);
+
+ /* SACK carries cum-ACK; bound by t_a like any other ACK. */
+ if (ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a))
+ return false;
+
+ /* srtt/8 gate starved recovery under burst loss; floor to save CPU. */
+ min_gap = (uint64_t) SACK_MIN_GAP_NS;
+
+ if (!ts_aged_ns(now_ns, frcti->t_snd_sack, min_gap))
+ return false;
+
+ out->dsack = false;
+ n = dsack_consume(frcti, out->blocks);
+ if (n == 1)
+ out->dsack = true;
+ n += sack_blocks_build(frcti, out->blocks + n,
+ frcti->sack_n_max - n);
+
+ if (!out->dsack
+ && rcv_cr->lwe == frcti->sack_lwe && n == frcti->sack_n)
+ return false;
+
+ out->n = n;
+ out->ack = rcv_cr->lwe;
+ out->rwe = frcti_advert_rwe(frcti);
+ frcti->t_snd_sack = now_ns;
+ frcti->sack_lwe = rcv_cr->lwe;
+ frcti->sack_n = n;
+
+ return true;
+}
+
+/* Wire-dup of fresh DATA at an already-ACKed seqno. */
+static __inline__ bool is_dup_data(uint16_t flags,
+ uint32_t seqno,
+ uint32_t lwe)
+{
+ if (!(flags & FRCT_DATA))
+ return false;
+
+ if (flags & FRCT_RXM)
+ return false;
+
+ return before(seqno, lwe);
+}
+
+/*
+ * Wire-dup ACK packet: same seqno as the previous emission. Updates
+ * the dedup ackno on a fresh ACK; caller drops on true.
+ */
+static __inline__ bool is_dup_ack(struct frcti * frcti,
+ uint16_t flags,
+ uint32_t seqno)
+{
+ if (flags & FRCT_DATA)
+ return false;
+
+ if (!(flags & FRCT_ACK))
+ return false;
+
+ if (seqno == frcti->rcv_cr.ackno)
+ return true;
+
+ frcti->rcv_cr.ackno = seqno;
+
+ return false;
+}
+
+/* Caller wrlock. */
+__attribute__((cold))
+static void seqno_rotate(struct frcti * frcti,
+ uint64_t now_ns)
+{
+ struct frct_cr * snd_cr = &frcti->snd_cr;
+
+ if (!ts_aged_ns(now_ns, snd_cr->act, snd_cr->inact))
+ return;
+ /* Idle-on-wire ≠ idle e2e: don't orphan in-flight rxm. */
+ if (snd_cr->seqno != snd_cr->lwe)
+ return;
+
+ /* Avoid colliding with peer's current rcv window. */
+ do {
+ random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno));
+ } while (in_window(snd_cr->seqno, snd_cr));
+ STORE_RELEASE(&snd_cr->lwe, snd_cr->seqno);
+ STORE_RELAXED(&snd_cr->rwe, snd_cr->lwe + START_WINDOW);
+ frcti->rtt_lwe = snd_cr->seqno;
+ frcti->in_recovery = false;
+ frcti->recovery_high = snd_cr->seqno;
+}
+
+__attribute__((hot))
+static int frcti_snd(struct frcti * frcti,
+ struct ssm_pk_buff * spb,
+ uint16_t flags)
+{
+ struct frct_pci * pci;
+ struct frct_pci_stream * spci = NULL;
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ uint32_t seqno;
+ uint16_t pci_flags = 0;
+ bool rtx;
+ uint64_t now_ns;
+ int64_t rcv_idle;
+ uint32_t probe_id = 0;
+ uint8_t probe_nonce[RTTP_NONCE_LEN] = { 0 };
+ bool probe;
+ size_t payload_len = 0;
assert(frcti);
- assert(ssm_pk_buff_len(spb) != 0);
+ /* Stream mode permits 0-byte sends for the EOS marker. */
+ assert(ssm_pk_buff_len(spb) != 0 || frcti->stream);
snd_cr = &frcti->snd_cr;
rcv_cr = &frcti->rcv_cr;
- timerwheel_move();
+ tw_move_safe();
+
+ if (frcti->stream)
+ payload_len = ssm_pk_buff_len(spb);
- pci = (struct frct_pci *) ssm_pk_buff_head_alloc(spb, FRCT_PCILEN);
+ pci = FRCT_HDR_PUSH(spb, frcti);
if (pci == NULL)
return -ENOMEM;
- memset(pci, 0, sizeof(*pci));
+ memset(pci, 0, FRCT_PCILEN);
+
+ if (frcti->stream)
+ spci = FRCT_SPCI(pci);
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
pthread_rwlock_wrlock(&frcti->lock);
rtx = snd_cr->cflags & FRCTFRTX;
- pci->flags |= FRCT_DATA;
+ pci_flags |= FRCT_DATA;
+ if (!frcti->stream)
+ pci_flags |= (flags & FRCT_FR_MASK);
- /* Set DRF if there are no unacknowledged packets. */
- if (snd_cr->seqno == snd_cr->lwe)
- pci->flags |= FRCT_DRF;
+ if (!frcti->stream && (flags & FRCT_FR_MASK) != FRCT_FR_SOLE)
+ STAT_BUMP(frcti, frag_snd);
- /* Choose a new sequence number if sender inactivity expired. */
- if (now.tv_sec - snd_cr->act.tv_sec > snd_cr->inact) {
- /* There are no unacknowledged packets. */
- assert(snd_cr->seqno == snd_cr->lwe);
- random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno));
- snd_cr->lwe = snd_cr->seqno;
- snd_cr->rwe = snd_cr->lwe + START_WINDOW;
+ if (frcti->stream) {
+ if (flags & FRCT_FIN)
+ pci_flags |= FRCT_FIN;
+
+ spci->start = hton32(frcti->snd_byte_next);
+ frcti->snd_byte_next += (uint32_t) payload_len;
+ spci->end = hton32(frcti->snd_byte_next);
+ STAT_ADD(frcti, strm_snd_byte, payload_len);
}
+ if (snd_cr->seqno == snd_cr->lwe)
+ pci_flags |= FRCT_DRF;
+
+ seqno_rotate(frcti, now_ns);
+
seqno = snd_cr->seqno;
pci->seqno = hton32(seqno);
- if (now.tv_sec - rcv_cr->act.tv_sec < rcv_cr->inact) {
- pci->flags |= FRCT_FC;
- *((uint32_t *) pci) |= hton32(rcv_cr->rwe & 0x00FFFFFF);
+ rcv_idle = ts_age_ns(now_ns, rcv_cr->act);
+
+ if (rcv_idle < (int64_t) rcv_cr->inact) {
+ pci_flags |= FRCT_FC;
+ pci->window = hton32(frcti_advert_rwe(frcti));
}
if (!rtx) {
- snd_cr->lwe++;
+ STORE_RELEASE(&snd_cr->lwe, snd_cr->lwe + 1);
+ STORE_RELEASE(&snd_cr->rwe, snd_cr->lwe + RQ_SIZE);
} else {
- if (!frcti->probe) {
- frcti->rttseq = snd_cr->seqno;
- frcti->t_probe = now;
- frcti->probe = true;
-#ifdef PROC_FLOW_STATS
- frcti->n_prb++;
-#endif
- }
- if ((now.tv_sec - rcv_cr->act.tv_sec) * BILLION <= frcti->a) {
- pci->flags |= FRCT_ACK;
+ size_t p = RQ_SLOT(seqno);
+ frcti->snd_slots[p].time = now_ns;
+ /* Fresh send clears RTX bits. */
+ frcti->snd_slots[p].flags = 0;
+ if (rcv_idle <= (int64_t) frcti->t_a) {
+ pci_flags |= FRCT_ACK;
pci->ackno = hton32(rcv_cr->lwe);
rcv_cr->seqno = rcv_cr->lwe;
}
}
+ pci->flags = hton16(pci_flags);
+
+ frct_hcs_set(pci, frcti->stream);
+
snd_cr->seqno++;
- snd_cr->act = now;
+ STORE_RELEASE(&snd_cr->act, now_ns);
+
+ probe = rtt_probe_arm(frcti, now_ns, &probe_id, probe_nonce);
pthread_rwlock_unlock(&frcti->lock);
- if (rtx)
- timerwheel_rxm(frcti, seqno, spb);
+ if (probe)
+ frcti_rttp_snd(frcti, probe_id, 0, probe_nonce);
+
+ if (rtx) {
+ rxm_arm(frcti, seqno, spb);
+ tlp_arm(frcti);
+ }
return 0;
}
-static void rtt_estimator(struct frcti * frcti,
- time_t mrtt)
+/*
+ * Stream: 0-byte FRCT_FIN DATA so peer's flow_read returns 0 at this
+ * byte. Msg: control packet with FRCT_FIN flag, snd_cr.seqno carried
+ * in pci->ackno (sender packs via frcti_pkt_snd's ackno parameter).
+ */
+static void frcti_fin_snd(struct frcti * frcti)
{
- time_t srtt = frcti->srtt;
- time_t rttvar = frcti->mdev;
+ struct ssm_pk_buff * spb;
+ bool already;
+ uint32_t fin_seqno;
- if (srtt == 0) { /* first measurement */
- srtt = mrtt;
- rttvar = mrtt >> 1;
- } else {
- time_t delta = mrtt - srtt;
- srtt += (delta >> 3);
- delta = (ABS(delta) - rttvar) >> 2;
-#ifdef FRCT_LINUX_RTT_ESTIMATOR
- if (delta < 0)
- delta >>= 3;
-#endif
- rttvar += delta;
+ if (!(frcti->snd_cr.cflags & FRCTFLINGER))
+ return;
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ already = frcti->snd_fin_sent;
+ frcti->snd_fin_sent = true;
+ fin_seqno = frcti->snd_cr.seqno;
+
+ if (!already && !frcti->stream)
+ frcti->snd_fin_seqno = fin_seqno;
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ if (already)
+ return;
+
+ if (!frcti->stream) {
+ frcti_pkt_snd(frcti, FRCT_FIN, fin_seqno, 0);
+ return;
}
-#ifdef PROC_FLOW_STATS
- frcti->n_rtt++;
-#endif
- frcti->srtt = MAX(1000L, srtt);
- frcti->mdev = MAX(100L, rttvar);
- frcti->rto = MAX(RTO_MIN, frcti->srtt + (frcti->mdev << MDEV_MUL));
-}
-
-/* Always queues the next application packet on the RQ. */
-static void __frcti_rcv(struct frcti * frcti,
- struct ssm_pk_buff * spb)
-{
- ssize_t idx;
- size_t pos;
- struct frct_pci * pci;
- struct timespec now;
- struct frct_cr * rcv_cr;
- struct frct_cr * snd_cr;
- uint32_t seqno;
- uint32_t ackno;
- uint32_t rwe;
- int fd = -1;
- assert(frcti);
+ if (frct_spb_reserve(frcti_data_hdr_len(frcti), &spb) < 0)
+ return;
+ /* Reset spb to 0-len so frcti_snd's head_alloc populates PCI. */
+ ssm_pk_buff_truncate(spb, 0);
+
+ if (frcti_snd(frcti, spb, FRCT_FIN) < 0) {
+ frct_spb_release(spb);
+ return;
+ }
+
+ if (frct_tx(frcti, spb) < 0)
+ return;
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ frcti->snd_fin_seqno = frcti->snd_cr.seqno - 1;
+
+ pthread_rwlock_unlock(&frcti->lock);
+}
+
+static bool final_ack_due(struct frcti * frcti,
+ struct frct_cr * rcv_cr,
+ uint64_t now_ns)
+{
+ if (rcv_cr->lwe == rcv_cr->seqno)
+ return false;
+
+ if (ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a))
+ return false;
+
+ return true;
+}
+
+/* Snd-side has FLINGER cflag and unACK'd data below the FIN/seqno. */
+static __inline__ bool snd_drain_pending(struct frct_cr * snd_cr,
+ uint32_t edge)
+{
+ if (!(snd_cr->cflags & FRCTFLINGER))
+ return false;
+
+ return before(snd_cr->lwe, edge);
+}
+
+/* Peer is still active and we haven't seen their FIN yet. */
+static __inline__ bool rcv_drain_pending(struct frcti * frcti,
+ struct frct_cr * rcv_cr,
+ uint64_t now_ns)
+{
+ if (frcti->rcv_fin_seen)
+ return false;
+
+ return !ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact);
+}
+
+/* Drain-loop predicate: snd-side unACK'd data OR peer still active. */
+static bool frcti_lingering(struct frcti * frcti)
+{
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ uint32_t edge;
+ uint64_t now_ns;
+ bool snd_linger;
+ bool rcv_linger;
+
+ /* Idempotent; emits FIN once per side, both stream and msg. */
+ frcti_fin_snd(frcti);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_rdlock(&frcti->lock);
+
+ snd_cr = &frcti->snd_cr;
rcv_cr = &frcti->rcv_cr;
+
+ if (frcti->snd_fin_sent)
+ edge = frcti->snd_fin_seqno;
+ else
+ edge = snd_cr->seqno;
+
+ snd_linger = snd_drain_pending(snd_cr, edge);
+ rcv_linger = rcv_drain_pending(frcti, rcv_cr, now_ns);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return snd_linger || rcv_linger;
+}
+
+static time_t frcti_dealloc(struct frcti * frcti)
+{
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ int ackno;
+ bool due;
+ int64_t now_ns;
+ int64_t rcv;
+ int64_t snd;
+
snd_cr = &frcti->snd_cr;
+ rcv_cr = &frcti->rcv_cr;
+
+ /* Idempotent; usually already sent by frcti_lingering. */
+ frcti_fin_snd(frcti);
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
- pci = (struct frct_pci *) ssm_pk_buff_head_release(spb, FRCT_PCILEN);
+ pthread_rwlock_rdlock(&frcti->lock);
- idx = ssm_pk_buff_get_idx(spb);
- seqno = ntoh32(pci->seqno);
- pos = seqno & (RQ_SIZE - 1);
+ ackno = rcv_cr->lwe;
+ rcv = (int64_t)(rcv_cr->act + rcv_cr->inact) - now_ns;
+ snd = (int64_t)(snd_cr->act + snd_cr->inact) - now_ns;
+ due = final_ack_due(frcti, rcv_cr, now_ns);
- pthread_rwlock_wrlock(&frcti->lock);
+ pthread_rwlock_unlock(&frcti->lock);
- if (now.tv_sec - rcv_cr->act.tv_sec > rcv_cr->inact) {
- if (pci->flags & FRCT_DRF) { /* New run. */
- rcv_cr->lwe = seqno;
- rcv_cr->rwe = seqno + RQ_SIZE;
- rcv_cr->seqno = seqno;
- } else if (pci->flags & FRCT_DATA) {
- goto drop_packet;
- }
- }
+ if (due)
+ frcti_pkt_snd(frcti, FRCT_ACK, ackno, 0);
- rcv_cr->act = now;
+ return (time_t) MAX((MAX(rcv, snd) / BILLION), 0);
+}
- /* For now, just send an immediate window update. */
- if (pci->flags & FRCT_RDVS) {
- fd = frcti->fd;
- rwe = rcv_cr->rwe;
- pthread_rwlock_unlock(&frcti->lock);
+__attribute__((hot))
+static void frcti_rcv(struct frcti * frcti,
+ struct ssm_pk_buff * spb)
+{
+ ssize_t idx;
+ size_t pos;
+ struct frct_pci * pci;
+ struct timespec now;
+ uint64_t now_ns;
+ struct frct_cr * rcv_cr;
+ uint32_t seqno;
+ uint16_t flags;
+ buffer_t pkt;
+ struct pending pending = { 0 };
+ bool in_order;
+ struct sack_args * sa = NULL;
+ bool send_sack = false;
- __send_frct_pkt(fd, FRCT_FC, 0, rwe);
+ assert(frcti);
+
+ rcv_cr = &frcti->rcv_cr;
- ssm_pool_remove(proc.pool, idx);
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ if (ssm_pk_buff_len(spb) < FRCT_PCILEN) {
+ frct_spb_release(spb);
return;
}
- if (pci->flags & FRCT_ACK) {
- ackno = ntoh32(pci->ackno);
- if (after(ackno, frcti->snd_cr.lwe))
- frcti->snd_cr.lwe = ackno;
+ pci = FRCT_HDR_POP(spb, frct_pci);
- if (frcti->probe && after(ackno, frcti->rttseq)) {
-#ifdef PROC_FLOW_STATS
- if (!(pci->flags & FRCT_DATA))
- frcti->n_dak++;
-#endif
- rtt_estimator(frcti, ts_diff_ns(&now, &frcti->t_probe));
- frcti->probe = false;
- }
+ idx = ssm_pk_buff_get_off(spb);
+ seqno = ntoh32(pci->seqno);
+ pos = RQ_SLOT(seqno);
+
+ flags = ntoh16(pci->flags);
+
+ pkt.data = ssm_pk_buff_head(spb);
+ pkt.len = ssm_pk_buff_len(spb);
+
+ if (flags & FRCT_RXM)
+ STAT_BUMP(frcti, rxm_rcv);
+
+ /* Stateless / lock-free dispatches. spb released via ctrl_done. */
+ if (flags & FRCT_KA) {
+ frcti_ka_rcv(frcti, pci, now_ns, flags);
+ goto ctrl_done;
}
- if (pci->flags & FRCT_FC) {
- uint32_t rwe;
+ if (flags & FRCT_RTTP) {
+ frcti_rttp_rcv(frcti, pkt, now_ns);
+ goto ctrl_done;
+ }
- rwe = ntoh32(*((uint32_t *)pci) & hton32(0x00FFFFFF));
- rwe |= snd_cr->rwe & 0xFF000000;
+ if (flags & FRCT_NACK) {
+ frcti_nack_rcv(frcti);
+ goto ctrl_done;
+ }
+
+ if (flags & FRCT_RDVS) {
+ frcti_rdv_rcv(frcti);
+ goto ctrl_done;
+ }
- /* Rollover for 24 bit */
- if (before(rwe, snd_cr->rwe) && snd_cr->rwe - rwe > 0x007FFFFF)
- rwe += 0x01000000;
+ /* Msg-mode FIN: control packet, FIN seqno carried in pci->ackno. */
+ if ((flags & FRCT_FIN) && !(flags & FRCT_DATA)) {
+ pthread_rwlock_wrlock(&frcti->lock);
+ if (!frcti->rcv_fin_seen) {
+ frcti->rcv_fin_seen = true;
+ frcti->rcv_byte_fin = ntoh32(pci->ackno);
+ }
+ pthread_rwlock_unlock(&frcti->lock);
+ goto ctrl_done;
+ }
- snd_cr->rwe = rwe;
+ pthread_rwlock_wrlock(&frcti->lock);
- pthread_mutex_lock(&frcti->mtx);
- if (!frcti->open) {
- frcti->open = true;
- pthread_cond_broadcast(&frcti->cond);
+ /* rcv_inact_check is a no-op for non-DATA non-DRF packets. */
+ if (flags & (FRCT_DATA | FRCT_DRF)) {
+ switch (rcv_inact_check(frcti, flags, seqno, now_ns)) {
+ case FRCT_INACT_NEED_NACK:
+ pthread_rwlock_unlock(&frcti->lock);
+ frcti_nack_snd(frcti, seqno - 1);
+ frct_spb_release(spb);
+ return;
+ case FRCT_INACT_DROP:
+ STAT_BUMP(frcti, inact_drop);
+ goto drop_packet;
+ case FRCT_ACTIVE:
+ /* FALLTHRU */
+ default:
+ break;
}
- pthread_mutex_unlock(&frcti->mtx);
}
- if (!(pci->flags & FRCT_DATA))
+ /* DATA-only act refresh: non-DATA would lock out DRF rebase. */
+ if (flags & FRCT_DATA)
+ STORE_RELEASE(&rcv_cr->act, now_ns);
+
+ /* Wire-dup ACK packet: same seqno as the previous emission. */
+ if (is_dup_ack(frcti, flags, seqno)) {
+ STAT_BUMP(frcti, ack_dup_rcv);
+ goto drop_packet;
+ }
+
+ /* Wire-dup of DATA: piggybacked ACK info already processed. */
+ if (is_dup_data(flags, seqno, rcv_cr->lwe)) {
+ rcv_cr->seqno = seqno;
+ STAT_BUMP(frcti, dup_rcv);
+ /* RFC 2883 §4 case 1: dup below cum-ACK. */
+ frcti->dsack_seqno = seqno;
+ frcti->dsack_valid = true;
+ goto drop_packet;
+ }
+
+ if (flags & FRCT_ACK)
+ frcti_ack_rcv(frcti, pci, flags, now_ns, &pending);
+
+ if (flags & FRCT_SACK)
+ frcti_sack_rcv(frcti, pkt, ntoh32(pci->ackno),
+ now_ns, &pending);
+
+ if (flags & FRCT_FC)
+ frcti_fc_rcv(frcti, pci);
+
+ if (!(flags & FRCT_DATA))
goto drop_packet;
if (before(seqno, rcv_cr->lwe)) {
- rcv_cr->seqno = seqno; /* Ensures we send a new ACK. */
-#ifdef PROC_FLOW_STATS
- frcti->n_dup++;
-#endif
+ /* Bump rcv_cr.seqno to force ack_snd to fire on the dup. */
+ rcv_cr->seqno = seqno;
+ if (flags & FRCT_RXM)
+ STAT_BUMP(frcti, rxm_dup_rcv);
+ else
+ STAT_BUMP(frcti, dup_rcv);
+ /* RFC 2883 §4 case 1: dup below cum-ACK. */
+ frcti->dsack_seqno = seqno;
+ frcti->dsack_valid = true;
goto drop_packet;
}
- if (rcv_cr->cflags & FRCTFRTX) {
+ if (!rq_accept(frcti, seqno, pos, flags))
+ goto drop_packet;
- if (!before(seqno, rcv_cr->rwe)) { /* Out of window. */
-#ifdef PROC_FLOW_STATS
- frcti->n_out++;
-#endif
+ if (frcti->stream) {
+ if (frcti_stream_data_rcv(frcti, spb, pos, flags) < 0) {
+ STAT_BUMP(frcti, strm_drop);
goto drop_packet;
}
-
- if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) {
-#ifdef PROC_FLOW_STATS
- frcti->n_rqo++;
-#endif
- goto drop_packet; /* Out of rq. */
- }
- if (frcti->rq[pos] != -1) {
-#ifdef PROC_FLOW_STATS
- frcti->n_dup++;
-#endif
- goto drop_packet; /* Duplicate in rq. */
- }
- fd = frcti->fd;
+ /* spb consumed by stash; do not release in drop path. */
+ spb = NULL;
} else {
- rcv_cr->lwe = seqno;
+ frcti_data_stash(frcti, idx, pos, flags);
+ }
+
+ /* Lazy alloc: only OOO arrivals can trigger a SACK send. */
+ if (after(seqno, rcv_cr->lwe) && frcti->sack_n_max > 0) {
+ size_t sa_sz = sizeof(*sa)
+ + frcti->sack_n_max * sizeof(sa->blocks[0]);
+ sa = malloc(sa_sz);
+ /* If alloc fails, sack_check sees NULL and we skip SACK. */
}
- frcti->rq[pos] = idx;
+ send_sack = sa != NULL && sack_check(frcti, seqno, now_ns, sa);
+ in_order = !after(seqno, rcv_cr->lwe);
pthread_rwlock_unlock(&frcti->lock);
- if (fd != -1)
- timerwheel_delayed_ack(fd, frcti);
+ if (send_sack) {
+ STAT_BUMP(frcti, sack_snd);
+ if (sa->dsack)
+ STAT_BUMP(frcti, dsack_snd);
+ frcti_sack_snd(frcti, sa);
+ } else if (in_order) {
+ ack_arm(frcti);
+ }
+
+ if ((flags & FRCT_ACK) && frcti->snd_cr.seqno != frcti->snd_cr.lwe)
+ tlp_arm(frcti);
+
+ pending_flush(frcti, &pending);
+
+ frcti_rcv_probe(frcti, now_ns);
+ free(sa);
+ return;
+
+ ctrl_done:
+ frct_spb_release(spb);
return;
drop_packet:
pthread_rwlock_unlock(&frcti->lock);
- ssm_pool_remove(proc.pool, idx);
- send_frct_pkt(frcti);
- return;
+ frct_spb_release(spb);
+ /* with_sack=true: ack_snd no-ops if neither dsack nor SACK is due. */
+ ack_snd(frcti, true);
+
+ pending_flush(frcti, &pending);
+ free(sa);
}
+
+/* NULL-shim macros for the no-FRCT case. */
+
+#define FRCTI_SND(frcti, spb, flags) \
+ ((frcti) == NULL ? 0 : frcti_snd((frcti), (spb), (flags)))
+
+#define FRCTI_RCV(frcti, spb) \
+ do { \
+ if ((frcti) != NULL) \
+ frcti_rcv((frcti), (spb)); \
+ } while (0)
+
+#define FRCTI_PDU_READY(frcti) \
+ ((frcti) != NULL && frcti_pdu_ready(frcti))
+
+#define FRCTI_CONSUME(frcti, buf, count) \
+ ((frcti) == NULL ? (ssize_t) -EAGAIN \
+ : (frcti)->stream \
+ ? frcti_consume_stream((frcti), (buf), (count)) \
+ : frcti_consume((frcti), (buf), (count)))
+
+#define FRCTI_IS_FRTX(frcti) \
+ ((frcti) != NULL && ((frcti)->rcv_cr.cflags & FRCTFRTX))
+
+#define FRCTI_IS_STREAM(frcti) ((frcti) != NULL && (frcti)->stream)
+
+#define FRCTI_PAYLOAD_CAP(frcti) \
+ ((frcti)->frag_mtu - frcti_data_hdr_len(frcti))
+
+#define FRCTI_NEEDS_FRAG(frcti, count) \
+ ((frcti) != NULL && (count) > FRCTI_PAYLOAD_CAP(frcti))
+
+#define FRCTI_IS_WINDOW_OPEN(frcti) \
+ ((frcti) == NULL ? true : frcti_is_window_open(frcti))
+
+#define FRCTI_IS_WINDOW_OPEN_N(frcti, n) \
+ ((frcti) == NULL ? true : frcti_is_window_open_n((frcti), (n)))
+
+#define FRCTI_LINGERING(frcti) \
+ ((frcti) == NULL ? false : frcti_lingering(frcti))
+
+#define FRCTI_DEALLOC(frcti) \
+ ((frcti) == NULL ? (time_t) 0 : frcti_dealloc(frcti))
+
diff --git a/src/lib/hash.c b/src/lib/hash.c
index 7adee968..62bbf2b8 100644
--- a/src/lib/hash.c
+++ b/src/lib/hash.c
@@ -39,6 +39,9 @@
#include <ouroboros/md5.h>
#include <ouroboros/sha3.h>
#endif
+#include <ouroboros/crc8.h>
+#include <ouroboros/crc16.h>
+#include <ouroboros/crc64.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
@@ -69,6 +72,12 @@ int hash_len_tbl [] = {
uint16_t hash_len(enum hash_algo algo)
{
+ if (algo == HASH_CRC8)
+ return CRC8_HASH_LEN;
+ if (algo == HASH_CRC16)
+ return CRC16_HASH_LEN;
+ if (algo == HASH_CRC64)
+ return CRC64_HASH_LEN;
#ifdef HAVE_LIBGCRYPT
return (uint16_t) gcry_md_get_algo_dlen(gcry_algo_tbl[algo]);
#else
@@ -81,12 +90,34 @@ void mem_hash(enum hash_algo algo,
const uint8_t * buf,
size_t len)
{
-#ifdef HAVE_LIBGCRYPT
- gcry_md_hash_buffer(gcry_algo_tbl[algo], dst, buf, len);
-#else
+#ifndef HAVE_LIBGCRYPT
struct sha3_ctx sha3_ctx;
struct md5_ctx md5_ctx;
+#endif
+ if (algo == HASH_CRC8) {
+ uint8_t crc = 0;
+
+ crc8_autosar(&crc, buf, len);
+ *(uint8_t *) dst = crc;
+ return;
+ }
+ if (algo == HASH_CRC16) {
+ uint16_t crc = 0;
+ crc16_ccitt_false(&crc, buf, len);
+ *(uint16_t *) dst = htobe16(crc);
+ return;
+ }
+ if (algo == HASH_CRC64) {
+ uint64_t crc = 0;
+
+ crc64_nvme(&crc, buf, len);
+ *(uint64_t *) dst = htobe64(crc);
+ return;
+ }
+#ifdef HAVE_LIBGCRYPT
+ gcry_md_hash_buffer(gcry_algo_tbl[algo], dst, buf, len);
+#else
switch (algo) {
case HASH_CRC32:
memset(dst, 0, CRC32_HASH_LEN);
diff --git a/src/lib/irm.c b/src/lib/irm.c
index 594014f7..c62701aa 100644
--- a/src/lib/irm.c
+++ b/src/lib/irm.c
@@ -614,7 +614,7 @@ ssize_t irm_list_names(struct name_info ** names)
return 0;
}
- *names = malloc(nr * sizeof(**names));
+ *names = calloc(nr, sizeof(**names));
if (*names == NULL) {
irm_msg__free_unpacked(recv_msg, NULL);
return -ENOMEM;
diff --git a/src/lib/pb/ipcp.proto b/src/lib/pb/ipcp.proto
index 9dc402f5..406b8d9c 100644
--- a/src/lib/pb/ipcp.proto
+++ b/src/lib/pb/ipcp.proto
@@ -54,7 +54,7 @@ message ipcp_msg {
optional int32 response = 10;
optional string comp = 11;
optional uint32 timeo_sec = 12;
- optional sint32 mpl = 13;
+ optional sint32 mpl = 13; /* MPL in ms. */
optional int32 result = 14;
optional uint32 uid = 15; /* 0 = GSPP, >0 = PUP uid */
}
diff --git a/src/lib/pb/irm.proto b/src/lib/pb/irm.proto
index 9ed0a29b..5de860a5 100644
--- a/src/lib/pb/irm.proto
+++ b/src/lib/pb/irm.proto
@@ -88,12 +88,12 @@ message irm_msg {
repeated ipcp_list_msg ipcps = 17;
repeated name_info_msg names = 18;
optional timespec_msg timeo = 19;
- optional sint32 mpl = 20;
+ optional sint32 mpl = 20; /* MPL in ms. */
optional string comp = 21;
optional bytes pk = 22; /* piggyback */
optional uint32 timeo_sec = 23;
optional uint32 timeo_nsec = 24;
optional sint32 result = 25;
- optional bytes sym_key = 26; /* symmetric encryption key */
- optional sint32 cipher_nid = 27; /* cipher NID */
+ optional bytes sym_key = 26; /* symmetric encryption key */
+ optional sint32 cipher_nid = 27; /* cipher NID */
}
diff --git a/src/lib/pb/model.proto b/src/lib/pb/model.proto
index f1382f3d..4c1564a5 100644
--- a/src/lib/pb/model.proto
+++ b/src/lib/pb/model.proto
@@ -28,7 +28,7 @@ message qosspec_msg {
required uint32 availability = 3; /* Class of 9s. */
required uint32 loss = 4; /* Packet loss. */
required uint32 ber = 5; /* Bit error rate, ppb. */
- required uint32 in_order = 6; /* In-order delivery. */
+ required uint32 service = 6; /* enum qos_service. */
required uint32 max_gap = 7; /* In ms. */
required uint32 timeout = 8; /* Timeout in ms. */
}
@@ -37,10 +37,11 @@ message flow_info_msg {
required uint32 id = 1;
required uint32 n_pid = 2;
required uint32 n_1_pid = 3;
- required uint32 mpl = 4;
+ required uint32 mpl = 4; /* MPL in ms. */
required uint32 state = 5;
required qosspec_msg qos = 6;
required uint32 uid = 7;
+ required uint32 mtu = 8; /* Layer MTU (bytes). */
}
message name_info_msg {
diff --git a/src/lib/protobuf.c b/src/lib/protobuf.c
index d419a9f1..a824d357 100644
--- a/src/lib/protobuf.c
+++ b/src/lib/protobuf.c
@@ -81,6 +81,7 @@ flow_info_msg_t * flow_info_s_to_msg(const struct flow_info * s)
msg->mpl = s->mpl;
msg->state = s->state;
msg->uid = s->uid;
+ msg->mtu = s->mtu;
msg->qos = qos_spec_s_to_msg(&s->qs);
if (msg->qos == NULL)
goto fail_msg;
@@ -107,6 +108,7 @@ struct flow_info flow_info_msg_to_s(const flow_info_msg_t * msg)
s.mpl = msg->mpl;
s.state = msg->state;
s.uid = msg->uid;
+ s.mtu = msg->mtu;
s.qs = qos_spec_msg_to_s(msg->qos);
return s;
@@ -137,7 +139,7 @@ name_info_msg_t * name_info_s_to_msg(const struct name_info * info)
goto fail_msg;
msg->ckey = strdup(info->c.key);
- if (msg->skey == NULL)
+ if (msg->ckey == NULL)
goto fail_msg;
msg->ccrt = strdup(info->c.crt);
@@ -161,6 +163,8 @@ struct name_info name_info_msg_to_s(const name_info_msg_t * msg)
assert(msg != NULL);
assert(strlen(msg->name) <= NAME_SIZE);
+ memset(&s, 0, sizeof(s));
+
strcpy(s.name, msg->name);
strcpy(s.s.key, msg->skey);
strcpy(s.s.crt, msg->scrt);
@@ -755,7 +759,7 @@ qosspec_msg_t * qos_spec_s_to_msg(const struct qos_spec * s)
msg->availability = s->availability;
msg->loss = s->loss;
msg->ber = s->ber;
- msg->in_order = s->in_order;
+ msg->service = s->service;
msg->max_gap = s->max_gap;
msg->timeout = s->timeout;
@@ -773,7 +777,7 @@ struct qos_spec qos_spec_msg_to_s(const qosspec_msg_t * msg)
s.availability = msg->availability;
s.loss = msg->loss;
s.ber = msg->ber;
- s.in_order = msg->in_order;
+ s.service = msg->service;
s.max_gap = msg->max_gap;
s.timeout = msg->timeout;
diff --git a/src/lib/qoscube.c b/src/lib/qoscube.c
index 1eaa0d7c..5d7ae17d 100644
--- a/src/lib/qoscube.c
+++ b/src/lib/qoscube.c
@@ -29,15 +29,11 @@
qoscube_t qos_spec_to_cube(qosspec_t qs)
{
- if (qs.delay <= qos_voice.delay &&
- qs.bandwidth <= qos_voice.bandwidth &&
- qs.availability >= qos_voice.availability &&
- qs.max_gap <= qos_voice.max_gap)
+ if (qs.delay <= 50 && qs.bandwidth <= 100000
+ && qs.availability >= 5 && qs.max_gap <= 50)
return QOS_CUBE_VOICE;
- else if (qs.delay <= qos_video.delay &&
- qs.bandwidth <= qos_video.bandwidth &&
- qs.availability >= qos_video.availability &&
- qs.max_gap <= qos_video.max_gap)
+ else if (qs.delay <= 100 && qs.availability >= 3
+ && qs.max_gap <= 100)
return QOS_CUBE_VIDEO;
else
return QOS_CUBE_BE;
diff --git a/src/lib/random.c b/src/lib/random.c
index 96315132..2c9a6c0d 100644
--- a/src/lib/random.c
+++ b/src/lib/random.c
@@ -47,8 +47,9 @@ int random_buffer(void * buf,
gcry_randomize(buf, len, GCRY_STRONG_RANDOM);
return 0;
#elif defined(HAVE_OPENSSL_RNG)
- if (len > 0 && len < INT_MAX)
- return RAND_bytes((unsigned char *) buf, (int) len);
- return -1;
+ if (len == 0 || len >= INT_MAX)
+ return -1;
+
+ return RAND_bytes((unsigned char *) buf, (int) len) == 1 ? 0 : -1;
#endif
}
diff --git a/src/lib/rib.c b/src/lib/rib.c
index a8d535c9..6e421397 100644
--- a/src/lib/rib.c
+++ b/src/lib/rib.c
@@ -112,14 +112,14 @@ static int rib_read(const char * path,
(void) info;
(void) offset;
- pthread_rwlock_wrlock(&rib.lock);
+ pthread_rwlock_rdlock(&rib.lock);
list_for_each(p, &rib.reg_comps) {
struct reg_comp * r = list_entry(p, struct reg_comp, next);
if (strcmp(comp, r->path) == 0) {
- int ret = r->ops->read(path + 1, buf, size);
+ struct rib_ops * ops = r->ops;
pthread_rwlock_unlock(&rib.lock);
- return ret;
+ return ops->read(path + 1, buf, size);
}
}
@@ -160,19 +160,25 @@ static int rib_readdir(const char * path,
ssize_t len;
ssize_t i;
struct reg_comp * c;
+ struct rib_ops * ops;
c = list_entry(p, struct reg_comp, next);
if (strcmp(path + 1, c->path) != 0)
continue;
- assert(c->ops->readdir != NULL);
+ ops = c->ops;
+
+ assert(ops->readdir != NULL);
+
+ pthread_rwlock_unlock(&rib.lock);
- len = c->ops->readdir(&dir_entries);
+ len = ops->readdir(&dir_entries);
if (len < 0)
- break;
+ return 0;
for (i = 0; i < len; ++i)
filler(buf, dir_entries[i], NULL, 0);
freepp(char, dir_entries, len);
+ return 0;
}
}
diff --git a/src/lib/ssm/flow_set.c b/src/lib/ssm/flow_set.c
index 73d0db55..cb38e6fd 100644
--- a/src/lib/ssm/flow_set.c
+++ b/src/lib/ssm/flow_set.c
@@ -58,9 +58,9 @@
#define QUEUESIZE ((SSM_RBUFF_SIZE) * sizeof(struct flowevent))
#define SSM_FSET_FILE_SIZE (SYS_MAX_FLOWS * sizeof(ssize_t) \
- + PROG_MAX_FQUEUES * sizeof(size_t) \
- + PROG_MAX_FQUEUES * sizeof(pthread_cond_t) \
- + PROG_MAX_FQUEUES * QUEUESIZE \
+ + PROC_MAX_FQUEUES * sizeof(size_t) \
+ + PROC_MAX_FQUEUES * sizeof(pthread_cond_t) \
+ + PROC_MAX_FQUEUES * QUEUESIZE \
+ sizeof(pthread_mutex_t))
#define fqueue_ptr(fs, idx) (fs->fqueues + (SSM_RBUFF_SIZE) * idx)
@@ -104,10 +104,10 @@ static struct ssm_flow_set * flow_set_create(pid_t pid,
set->mtable = shm_base;
set->heads = (size_t *) (set->mtable + SYS_MAX_FLOWS);
- set->conds = (pthread_cond_t *)(set->heads + PROG_MAX_FQUEUES);
- set->fqueues = (struct flowevent *) (set->conds + PROG_MAX_FQUEUES);
+ set->conds = (pthread_cond_t *)(set->heads + PROC_MAX_FQUEUES);
+ set->fqueues = (struct flowevent *) (set->conds + PROC_MAX_FQUEUES);
set->lock = (pthread_mutex_t *)
- (set->fqueues + PROG_MAX_FQUEUES * (SSM_RBUFF_SIZE));
+ (set->fqueues + PROC_MAX_FQUEUES * (SSM_RBUFF_SIZE));
return set;
@@ -164,7 +164,7 @@ struct ssm_flow_set * ssm_flow_set_create(pid_t pid)
if (pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK))
goto fail_condattr_set;
#endif
- for (i = 0; i < PROG_MAX_FQUEUES; ++i) {
+ for (i = 0; i < PROC_MAX_FQUEUES; ++i) {
set->heads[i] = 0;
if (pthread_cond_init(&set->conds[i], &cattr))
goto fail_init;
@@ -222,7 +222,7 @@ void ssm_flow_set_zero(struct ssm_flow_set * set,
ssize_t i = 0;
assert(set);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -242,7 +242,7 @@ int ssm_flow_set_add(struct ssm_flow_set * set,
{
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -264,7 +264,7 @@ void ssm_flow_set_del(struct ssm_flow_set * set,
{
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -282,7 +282,7 @@ int ssm_flow_set_has(struct ssm_flow_set * set,
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -332,7 +332,7 @@ ssize_t ssm_flow_set_wait(const struct ssm_flow_set * set,
ssize_t ret = 0;
assert(set);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
assert(fqueue);
#ifndef HAVE_ROBUST_MUTEX
diff --git a/src/lib/ssm/pool.c b/src/lib/ssm/pool.c
index 5c98b515..5607a360 100644
--- a/src/lib/ssm/pool.c
+++ b/src/lib/ssm/pool.c
@@ -24,6 +24,7 @@
#include "config.h"
+#include <ouroboros/atomics.h>
#include <ouroboros/errno.h>
#include <ouroboros/pthread.h>
#include <ouroboros/ssm_pool.h>
@@ -75,26 +76,6 @@ static const struct ssm_size_class_cfg ssm_pup_cfg[SSM_POOL_MAX_CLASSES] = {
#define GET_SHARD_FOR_PID(pid) ((int)((pid) % SSM_POOL_SHARDS))
-#define LOAD_RELAXED(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_RELAXED))
-
-#define LOAD_ACQUIRE(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_ACQUIRE))
-
-#define STORE_RELEASE(ptr, val) \
- (__atomic_store_n(ptr, val, __ATOMIC_RELEASE))
-
-#define LOAD(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_SEQ_CST))
-
-#define STORE(ptr, val) \
- (__atomic_store_n(ptr, val, __ATOMIC_SEQ_CST))
-
-#define FETCH_ADD(ptr, val) \
- (__atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST))
-
-#define FETCH_SUB(ptr, val) \
- (__atomic_fetch_sub(ptr, val, __ATOMIC_SEQ_CST))
#define SSM_FILE_SIZE (SSM_POOL_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr))
#define SSM_GSPP_FILE_SIZE (SSM_GSPP_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr))
@@ -107,6 +88,8 @@ static const struct ssm_size_class_cfg ssm_pup_cfg[SSM_POOL_MAX_CLASSES] = {
: SSM_PUP_FILE_SIZE)
#define GET_POOL_CFG(uid) (IS_GSPP(uid) ? ssm_gspp_cfg : ssm_pup_cfg)
+#define NEEDS_CHOWN(uid, gid) ((uid) != geteuid() || (gid) != getegid())
+
struct ssm_pool {
uint8_t * shm_base; /* start of blocks */
struct _ssm_pool_hdr * hdr; /* shared memory header */
@@ -163,29 +146,6 @@ static __inline__ void list_add_head(struct _ssm_list_head * head,
STORE(&head->count, LOAD(&head->count) + 1);
}
-static __inline__ int select_size_class(struct ssm_pool * pool,
- size_t len)
-{
- size_t sz;
- int i;
-
- assert(pool != NULL);
-
- /* Total space needed: header + headspace + data + tailspace */
- sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_HEADSPACE + len
- + SSM_PK_BUFF_TAILSPACE;
-
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- struct _ssm_size_class * sc;
-
- sc = &pool->hdr->size_classes[i];
- if (sc->object_size > 0 && sz <= sc->object_size)
- return i;
- }
-
- return -1;
-}
-
static __inline__ int find_size_class_for_offset(struct ssm_pool * pool,
size_t offset)
{
@@ -548,7 +508,7 @@ static struct ssm_pool * __pool_create(const char * name,
if (flags & O_CREAT) {
if (ftruncate(fd, (off_t) file_size) < 0)
goto fail_truncate;
- if (uid != geteuid() && fchown(fd, uid, gid) < 0)
+ if (NEEDS_CHOWN(uid, gid) && fchown(fd, uid, gid) < 0)
goto fail_truncate;
}
@@ -700,7 +660,7 @@ ssize_t ssm_pool_alloc(struct ssm_pool * pool,
assert(pool != NULL);
assert(spb != NULL);
- idx = select_size_class(pool, count);
+ idx = select_size_class(pool->hdr, count);
if (idx >= 0)
return alloc_from_sc(pool, idx, count, ptr, spb);
@@ -718,7 +678,7 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
assert(pool != NULL);
assert(spb != NULL);
- idx = select_size_class(pool, count);
+ idx = select_size_class(pool->hdr, count);
if (idx >= 0)
return alloc_from_sc_b(pool, idx, count, ptr, spb, abstime);
@@ -744,7 +704,7 @@ ssize_t ssm_pool_read(uint8_t ** dst,
}
struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool,
- size_t off)
+ size_t off)
{
struct ssm_pk_buff * blk;
@@ -823,36 +783,36 @@ int ssm_pool_remove(struct ssm_pool * pool,
return 0;
}
-size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb)
+size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
return spb->off;
}
-uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb)
+uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
- return spb->data + spb->pk_head;
+ return (uint8_t *) spb->data + spb->pk_head;
}
-uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb)
+uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
- return spb->data + spb->pk_tail;
+ return (uint8_t *) spb->data + spb->pk_tail;
}
-size_t ssm_pk_buff_len(struct ssm_pk_buff * spb)
+size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
return spb->pk_tail - spb->pk_head;
}
-uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb,
+ size_t size)
{
assert(spb != NULL);
@@ -864,8 +824,8 @@ uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb,
return spb->data + spb->pk_head;
}
-uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb,
+ size_t size)
{
uint8_t * buf;
@@ -881,8 +841,8 @@ uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb,
return buf;
}
-uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb,
+ size_t size)
{
uint8_t * buf;
@@ -896,8 +856,8 @@ uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb,
return buf;
}
-uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb,
+ size_t size)
{
assert(spb != NULL);
assert(!(size > spb->pk_tail - spb->pk_head));
diff --git a/src/lib/ssm/rbuff.c b/src/lib/ssm/rbuff.c
index e4558c31..c149c306 100644
--- a/src/lib/ssm/rbuff.c
+++ b/src/lib/ssm/rbuff.c
@@ -80,6 +80,7 @@ struct ssm_rbuff {
pthread_cond_t * del; /* signal when data removed */
pid_t pid; /* pid of the owner */
int flow_id; /* flow_id of the flow */
+ size_t n_users; /* in-flight users */
};
#define MM_FLAGS (PROT_READ | PROT_WRITE)
@@ -119,6 +120,7 @@ static struct ssm_rbuff * rbuff_create(pid_t pid,
rb->del = rb->add + 1;
rb->pid = pid;
rb->flow_id = flow_id;
+ rb->n_users = 0;
return rb;
@@ -228,11 +230,20 @@ void ssm_rbuff_close(struct ssm_rbuff * rb)
{
assert(rb);
+ /*
+ * Caller must set ACL_FLOWDOWN first; if a user becomes
+ * cancellable, push a cleanup that decrements n_users.
+ */
+ while (__atomic_load_n(&rb->n_users, __ATOMIC_SEQ_CST) > 0) {
+ struct timespec tic = { 0, 100000 };
+ nanosleep(&tic, NULL);
+ }
+
rbuff_destroy(rb);
}
int ssm_rbuff_write(struct ssm_rbuff * rb,
- size_t idx)
+ size_t off)
{
size_t acl;
bool was_empty;
@@ -240,6 +251,8 @@ int ssm_rbuff_write(struct ssm_rbuff * rb,
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
if (acl != ACL_RDWR) {
if (acl & ACL_FLOWDOWN) {
@@ -261,7 +274,7 @@ int ssm_rbuff_write(struct ssm_rbuff * rb,
was_empty = IS_EMPTY(rb);
- HEAD(rb) = (ssize_t) idx;
+ HEAD(rb) = (ssize_t) off;
ADVANCE_HEAD(rb);
if (was_empty)
@@ -269,16 +282,18 @@ int ssm_rbuff_write(struct ssm_rbuff * rb,
pthread_mutex_unlock(rb->mtx);
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return 0;
fail_mutex:
pthread_mutex_unlock(rb->mtx);
fail_acl:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
int ssm_rbuff_write_b(struct ssm_rbuff * rb,
- size_t idx,
+ size_t off,
const struct timespec * abstime)
{
size_t acl;
@@ -287,6 +302,8 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
if (acl != ACL_RDWR) {
if (acl & ACL_FLOWDOWN) {
@@ -316,7 +333,7 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
if (ret != -ETIMEDOUT && ret != -EFLOWDOWN) {
was_empty = IS_EMPTY(rb);
- HEAD(rb) = (ssize_t) idx;
+ HEAD(rb) = (ssize_t) off;
ADVANCE_HEAD(rb);
if (was_empty)
pthread_cond_broadcast(rb->add);
@@ -325,6 +342,7 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
pthread_mutex_unlock(rb->mtx);
fail_acl:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
@@ -351,11 +369,21 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb)
assert(rb != NULL);
- if (IS_EMPTY(rb))
- return check_rb_acl(rb);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
+ if (IS_EMPTY(rb)) {
+ ret = check_rb_acl(rb);
+ goto out;
+ }
robust_mutex_lock(rb->mtx);
+ if (IS_EMPTY(rb)) {
+ pthread_mutex_unlock(rb->mtx);
+ ret = check_rb_acl(rb);
+ goto out;
+ }
+
ret = TAIL(rb);
ADVANCE_TAIL(rb);
@@ -363,6 +391,8 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb)
pthread_mutex_unlock(rb->mtx);
+ out:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
@@ -374,9 +404,13 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb,
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
- if (IS_EMPTY(rb) && (acl & ACL_FLOWDOWN))
- return -EFLOWDOWN;
+ if (IS_EMPTY(rb) && (acl & ACL_FLOWDOWN)) {
+ idx = -EFLOWDOWN;
+ goto out;
+ }
robust_mutex_lock(rb->mtx);
@@ -402,6 +436,8 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb,
assert(idx != -EAGAIN);
+ out:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return idx;
}
@@ -410,7 +446,11 @@ void ssm_rbuff_set_acl(struct ssm_rbuff * rb,
{
assert(rb != NULL);
+ robust_mutex_lock(rb->mtx);
__atomic_store_n(rb->acl, (size_t) flags, __ATOMIC_SEQ_CST);
+ pthread_cond_broadcast(rb->add);
+ pthread_cond_broadcast(rb->del);
+ pthread_mutex_unlock(rb->mtx);
}
uint32_t ssm_rbuff_get_acl(struct ssm_rbuff * rb)
@@ -424,6 +464,8 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb)
{
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
robust_mutex_lock(rb->mtx);
pthread_cleanup_push(__cleanup_mutex_unlock, rb->mtx);
@@ -432,6 +474,8 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb)
robust_wait(rb->del, rb->mtx, NULL);
pthread_cleanup_pop(true);
+
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
}
size_t ssm_rbuff_queued(struct ssm_rbuff * rb)
diff --git a/src/lib/ssm/ssm.h.in b/src/lib/ssm/ssm.h.in
index b9246c8b..b86327a1 100644
--- a/src/lib/ssm/ssm.h.in
+++ b/src/lib/ssm/ssm.h.in
@@ -38,7 +38,6 @@
#define SSM_RBUFF_PREFIX "@SSM_RBUFF_PREFIX@"
#define SSM_FLOW_SET_PREFIX "@SSM_FLOW_SET_PREFIX@"
#define SSM_POOL_NAME "@SSM_POOL_NAME@"
-#define SSM_POOL_BLOCKS @SSM_POOL_BLOCKS@
#define SSM_RBUFF_SIZE @SSM_RBUFF_SIZE@
/* Packet buffer space reservation */
@@ -164,6 +163,24 @@ struct _ssm_pool_hdr {
struct _ssm_size_class size_classes[SSM_POOL_MAX_CLASSES];
};
+#define SSM_PK_BUFF_TOTALSPACE (SSM_PK_BUFF_HEADSPACE + SSM_PK_BUFF_TAILSPACE)
+static __inline__ int select_size_class(struct _ssm_pool_hdr * hdr,
+ size_t len)
+{
+ size_t sz;
+ int i;
+
+ sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_TOTALSPACE + len;
+
+ for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
+ struct _ssm_size_class * sc = &hdr->size_classes[i];
+ if (sc->object_size > 0 && sz <= sc->object_size)
+ return i;
+ }
+
+ return -1;
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/src/lib/ssm/tests/pool_sharding_test.c b/src/lib/ssm/tests/pool_sharding_test.c
index c53105e3..ec464a92 100644
--- a/src/lib/ssm/tests/pool_sharding_test.c
+++ b/src/lib/ssm/tests/pool_sharding_test.c
@@ -80,19 +80,13 @@ static int test_lazy_distribution(void)
goto fail_pool;
}
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc_idx = i;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- printf(" Class %d: count=%zu\n", i,
+ printf(" Class %d: object_size=%zu count=%zu\n", i,
+ hdr->size_classes[i].object_size,
hdr->size_classes[i].object_count);
}
goto fail_pool;
@@ -137,7 +131,6 @@ static int test_shard_migration(void)
ssize_t off;
int shard_idx;
int sc_idx;
- int i;
TEST_START();
@@ -149,18 +142,11 @@ static int test_shard_migration(void)
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc_idx = i;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
- goto fail;
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
+ goto fail_pool;
}
sc = &hdr->size_classes[sc_idx];
@@ -209,7 +195,6 @@ static int test_fallback_stealing(void)
size_t total_free;
size_t i;
int sc_idx;
- int c;
TEST_START();
@@ -221,18 +206,11 @@ static int test_fallback_stealing(void)
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (c = 0; c < SSM_POOL_MAX_CLASSES; c++) {
- if (hdr->size_classes[c].object_count > 0) {
- sc_idx = c;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
- goto fail;
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
+ goto fail_pool;
}
sc = &hdr->size_classes[sc_idx];
@@ -261,7 +239,7 @@ static int test_fallback_stealing(void)
/* Free them all - they go to local_shard */
for (i = 0; i < total_blocks / 2; i++) {
- size_t off = ssm_pk_buff_get_idx(spbs[i]);
+ size_t off = ssm_pk_buff_get_off(spbs[i]);
if (ssm_pool_remove(pool, off) != 0) {
printf("Remove %zu failed.\n", i);
free(spbs);
@@ -299,7 +277,7 @@ static int test_fallback_stealing(void)
/* Now all allocated blocks are in use again */
/* Cleanup - free all allocated blocks */
for (i = 0; i < total_blocks / 2; i++) {
- size_t off = ssm_pk_buff_get_idx(spbs[i]);
+ size_t off = ssm_pk_buff_get_off(spbs[i]);
ssm_pool_remove(pool, off);
}
@@ -396,20 +374,15 @@ static int test_multiprocess_sharding(void)
/* Verify blocks distributed across shards */
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc = NULL;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc = &hdr->size_classes[i];
- break;
- }
- }
-
- if (sc == NULL) {
- printf("No size classes configured.\n");
+ /* Inspect the class that TEST_SIZE allocations used */
+ i = select_size_class(hdr, TEST_SIZE);
+ if (i < 0) {
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
goto fail_pool;
}
+ sc = &hdr->size_classes[i];
+
/* After children allocate and free, blocks should be in shards
* (though exact distribution depends on PID values)
*/
diff --git a/src/lib/ssm/tests/pool_test.c b/src/lib/ssm/tests/pool_test.c
index 3fc19cd5..0f9db24d 100644
--- a/src/lib/ssm/tests/pool_test.c
+++ b/src/lib/ssm/tests/pool_test.c
@@ -741,14 +741,14 @@ static int test_ssm_pk_buff_operations(void)
memcpy(head, data, dlen);
- tail = ssm_pk_buff_tail_alloc(spb, 32);
+ tail = ssm_pk_buff_push_tail(spb, 32);
if (tail == NULL) {
- printf("Tail_alloc failed.\n");
+ printf("push_tail failed.\n");
goto fail_ops;
}
if (ssm_pk_buff_len(spb) != POOL_256 + 32) {
- printf("Length after tail_alloc: %zu.\n",
+ printf("Length after push_tail: %zu.\n",
ssm_pk_buff_len(spb));
goto fail_ops;
}
@@ -758,14 +758,14 @@ static int test_ssm_pk_buff_operations(void)
goto fail_ops;
}
- tail = ssm_pk_buff_tail_release(spb, 32);
+ tail = ssm_pk_buff_pop_tail(spb, 32);
if (tail == NULL) {
- printf("Tail_release failed.\n");
+ printf("pop_tail failed.\n");
goto fail_ops;
}
if (ssm_pk_buff_len(spb) != POOL_256) {
- printf("Length after tail_release: %zu.\n",
+ printf("Length after pop_tail: %zu.\n",
ssm_pk_buff_len(spb));
goto fail_ops;
}
diff --git a/src/lib/tests/CMakeLists.txt b/src/lib/tests/CMakeLists.txt
index 5a2f2c52..32836589 100644
--- a/src/lib/tests/CMakeLists.txt
+++ b/src/lib/tests/CMakeLists.txt
@@ -10,7 +10,6 @@ create_test_sourcelist(${PARENT_DIR}_tests test_suite.c
auth_test_slh_dsa.c
bitmap_test.c
btree_test.c
- crc32_test.c
crypt_test.c
hash_test.c
kex_test.c
@@ -20,6 +19,7 @@ create_test_sourcelist(${PARENT_DIR}_tests test_suite.c
sockets_test.c
time_test.c
tpm_test.c
+ tw_test.c
)
add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests})
diff --git a/src/lib/tests/auth_test.c b/src/lib/tests/auth_test.c
index 1a5a87af..0f3ef715 100644
--- a/src/lib/tests/auth_test.c
+++ b/src/lib/tests/auth_test.c
@@ -347,6 +347,59 @@ static int test_verify_crt(void)
return TEST_RC_FAIL;
}
+static int test_verify_crt_missing_root_ca(void)
+{
+ struct auth_ctx * auth;
+ void * _signed_server_crt;
+ void * _im_ca_crt;
+
+ TEST_START();
+
+ auth = auth_create_ctx();
+ if (auth == NULL) {
+ printf("Failed to create auth context.\n");
+ goto fail_create_ctx;
+ }
+
+ if (crypt_load_crt_str(signed_server_crt_ec, &_signed_server_crt) < 0) {
+ printf("Failed to load signed crt from string.\n");
+ goto fail_load_signed;
+ }
+
+ if (crypt_load_crt_str(im_ca_crt_ec, &_im_ca_crt) < 0) {
+ printf("Failed to load intermediate crt from string.\n");
+ goto fail_load_im_ca;
+ }
+
+ /* Add only the intermediate CA - root CA is missing */
+ if (auth_add_crt_to_store(auth, _im_ca_crt) < 0) {
+ printf("Failed to add intermediate ca crt to auth store.\n");
+ goto fail_add;
+ }
+
+ if (auth_verify_crt(auth, _signed_server_crt) == 0) {
+ printf("Verification should fail without root CA.\n");
+ goto fail_add;
+ }
+
+ crypt_free_crt(_im_ca_crt);
+ crypt_free_crt(_signed_server_crt);
+ auth_destroy_ctx(auth);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_add:
+ crypt_free_crt(_im_ca_crt);
+ fail_load_im_ca:
+ crypt_free_crt(_signed_server_crt);
+ fail_load_signed:
+ auth_destroy_ctx(auth);
+ fail_create_ctx:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
int test_auth_sign(void)
{
uint8_t buf[TEST_MSG_SIZE];
@@ -526,6 +579,7 @@ int auth_test(int argc,
ret |= test_crypt_check_pubkey_crt();
ret |= test_store_add();
ret |= test_verify_crt();
+ ret |= test_verify_crt_missing_root_ca();
ret |= test_auth_sign();
ret |= test_auth_bad_signature();
ret |= test_crt_str();
@@ -538,6 +592,7 @@ int auth_test(int argc,
(void) test_crypt_check_pubkey_crt;
(void) test_store_add;
(void) test_verify_crt;
+ (void) test_verify_crt_missing_root_ca;
(void) test_auth_sign;
(void) test_auth_bad_signature;
(void) test_crt_str;
diff --git a/src/lib/tests/hash_test.c b/src/lib/tests/hash_test.c
index e43847e1..451d3c25 100644
--- a/src/lib/tests/hash_test.c
+++ b/src/lib/tests/hash_test.c
@@ -39,6 +39,74 @@ struct vec_entry {
char * out;
};
+static int test_crc8(void)
+{
+ int ret = 0;
+
+ struct vec_entry vec [] = {
+ { "", "00" },
+ { "123456789", "df" },
+ { NULL, NULL }
+ };
+
+ struct vec_entry * cur = vec;
+
+ TEST_START();
+
+ while (cur->in != NULL) {
+ uint8_t crc;
+ char res[3];
+
+ str_hash(HASH_CRC8, &crc, cur->in);
+
+ sprintf(res, "%02x", crc);
+ if (strcmp(res, cur->out) != 0) {
+ printf("Hash failed %s != %s.\n", res, cur->out);
+ ret |= -1;
+ }
+
+ ++cur;
+ }
+
+ TEST_END(ret);
+
+ return ret;
+}
+
+static int test_crc16(void)
+{
+ int ret = 0;
+
+ struct vec_entry vec [] = {
+ { "", "ffff" },
+ { "123456789", "29b1" },
+ { NULL, NULL }
+ };
+
+ struct vec_entry * cur = vec;
+
+ TEST_START();
+
+ while (cur->in != NULL) {
+ uint8_t crc[2];
+ char res[5];
+
+ str_hash(HASH_CRC16, crc, cur->in);
+
+ sprintf(res, "%02x%02x", crc[0], crc[1]);
+ if (strcmp(res, cur->out) != 0) {
+ printf("Hash failed %s != %s.\n", res, cur->out);
+ ret |= -1;
+ }
+
+ ++cur;
+ }
+
+ TEST_END(ret);
+
+ return ret;
+}
+
static int test_crc32(void)
{
int ret = 0;
@@ -74,6 +142,42 @@ static int test_crc32(void)
return ret;
}
+static int test_crc64(void)
+{
+ int ret = 0;
+
+ struct vec_entry vec [] = {
+ { "", "0000000000000000" },
+ { "123456789", "ae8b14860a799888" },
+ { "0123456789abcdef",
+ "091485ca7018730e" },
+ { NULL, NULL }
+ };
+
+ struct vec_entry * cur = vec;
+
+ TEST_START();
+
+ while (cur->in != NULL) {
+ uint8_t crc[8];
+ char res[17];
+
+ str_hash(HASH_CRC64, crc, cur->in);
+
+ sprintf(res, HASH_FMT64, HASH_VAL64(crc));
+ if (strcmp(res, cur->out) != 0) {
+ printf("Hash failed %s != %s.\n", res, cur->out);
+ ret |= -1;
+ }
+
+ ++cur;
+ }
+
+ TEST_END(ret);
+
+ return ret;
+}
+
static int test_md5(void)
{
int ret = 0;
@@ -192,8 +296,14 @@ int hash_test(int argc,
(void) argc;
(void) argv;
+ ret |= test_crc8();
+
+ ret |= test_crc16();
+
ret |= test_crc32();
+ ret |= test_crc64();
+
ret |= test_md5();
ret |= test_sha3();
diff --git a/src/lib/tests/kex_test.c b/src/lib/tests/kex_test.c
index ced760fe..6a4f802e 100644
--- a/src/lib/tests/kex_test.c
+++ b/src/lib/tests/kex_test.c
@@ -106,7 +106,7 @@ static int test_kex_dh_pkp_create_destroy(void)
{
struct sec_config kex;
void * pkp;
- uint8_t buf[MSGBUFSZ];
+ uint8_t buf[CRYPT_KEY_BUFSZ];
TEST_START();
@@ -134,7 +134,7 @@ static int test_kex_get_algo_from_pk(const char * algo)
void * pkp;
buffer_t pk;
ssize_t len;
- uint8_t buf[MSGBUFSZ];
+ uint8_t buf[CRYPT_KEY_BUFSZ];
char extracted_algo[256];
TEST_START("(%s)", algo);
@@ -204,8 +204,8 @@ static int test_kex_dhe_derive(const char * algo)
buffer_t pk1;
buffer_t pk2;
ssize_t len;
- uint8_t buf1[MSGBUFSZ];
- uint8_t buf2[MSGBUFSZ];
+ uint8_t buf1[CRYPT_KEY_BUFSZ];
+ uint8_t buf2[CRYPT_KEY_BUFSZ];
uint8_t s1[SYMMKEYSZ];
uint8_t s2[SYMMKEYSZ];
@@ -317,7 +317,7 @@ static int test_kex_dhe_corrupted_pubkey(const char * algo)
void * pkp;
buffer_t pk;
ssize_t len;
- uint8_t buf[MSGBUFSZ];
+ uint8_t buf[CRYPT_KEY_BUFSZ];
uint8_t s[SYMMKEYSZ];
TEST_START("(%s)", algo);
@@ -363,8 +363,8 @@ static int test_kex_dhe_wrong_algo(void)
void * pkp2;
buffer_t pk2;
ssize_t len;
- uint8_t buf1[MSGBUFSZ];
- uint8_t buf2[MSGBUFSZ];
+ uint8_t buf1[CRYPT_KEY_BUFSZ];
+ uint8_t buf2[CRYPT_KEY_BUFSZ];
uint8_t s[SYMMKEYSZ];
const char * algo1 = "X25519";
const char * algo2 = "X448";
diff --git a/src/lib/tests/kex_test_ml_kem.c b/src/lib/tests/kex_test_ml_kem.c
index 3bb9ae7c..7761c3dc 100644
--- a/src/lib/tests/kex_test_ml_kem.c
+++ b/src/lib/tests/kex_test_ml_kem.c
@@ -197,8 +197,8 @@ static int test_kex_kem(const char * algo)
buffer_t ct;
ssize_t len;
ssize_t ct_len;
- uint8_t buf1[MSGBUFSZ];
- uint8_t buf2[MSGBUFSZ];
+ uint8_t buf1[CRYPT_KEY_BUFSZ];
+ uint8_t buf2[CRYPT_KEY_BUFSZ];
uint8_t s1[SYMMKEYSZ];
uint8_t s2[SYMMKEYSZ];
int kdf;
@@ -262,8 +262,8 @@ static int test_kex_kem_corrupted_ciphertext(const char * algo)
buffer_t ct;
ssize_t len;
ssize_t ct_len;
- uint8_t buf1[MSGBUFSZ];
- uint8_t buf2[MSGBUFSZ];
+ uint8_t buf1[CRYPT_KEY_BUFSZ];
+ uint8_t buf2[CRYPT_KEY_BUFSZ];
uint8_t s1[SYMMKEYSZ];
uint8_t s2[SYMMKEYSZ];
int kdf;
@@ -334,9 +334,9 @@ static int test_kex_kem_wrong_keypair(const char * algo)
buffer_t ct;
ssize_t len;
ssize_t ct_len;
- uint8_t buf1[MSGBUFSZ];
- uint8_t buf2[MSGBUFSZ];
- uint8_t buf3[MSGBUFSZ];
+ uint8_t buf1[CRYPT_KEY_BUFSZ];
+ uint8_t buf2[CRYPT_KEY_BUFSZ];
+ uint8_t buf3[CRYPT_KEY_BUFSZ];
uint8_t s1[SYMMKEYSZ];
uint8_t s2[SYMMKEYSZ];
@@ -402,8 +402,8 @@ static int test_kex_kem_truncated_ciphertext(const char * algo)
buffer_t ct;
ssize_t len;
ssize_t ct_len;
- uint8_t buf1[MSGBUFSZ];
- uint8_t buf2[MSGBUFSZ];
+ uint8_t buf1[CRYPT_KEY_BUFSZ];
+ uint8_t buf2[CRYPT_KEY_BUFSZ];
uint8_t s1[SYMMKEYSZ];
uint8_t s2[SYMMKEYSZ];
diff --git a/src/lib/tests/tpm_test.c b/src/lib/tests/tpm_test.c
index df1d8850..7cc049cd 100644
--- a/src/lib/tests/tpm_test.c
+++ b/src/lib/tests/tpm_test.c
@@ -21,7 +21,7 @@
*/
-#include "tpm.c"
+#include <ouroboros/tpm.h>
#include <test/test.h>
diff --git a/src/lib/tests/tw_test.c b/src/lib/tests/tw_test.c
new file mode 100644
index 00000000..32c302c4
--- /dev/null
+++ b/src/lib/tests/tw_test.c
@@ -0,0 +1,663 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Generic timing-wheel tests
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#define _DEFAULT_SOURCE
+#else
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include "config.h"
+
+#include <test/test.h>
+
+#include <ouroboros/time.h>
+#include <ouroboros/tw.h>
+
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+
+struct payload {
+ struct tw_entry tw;
+ int fired;
+};
+
+struct cancel_payload {
+ struct tw_entry tw;
+ int fired;
+ struct tw_entry * sibling;
+};
+
+struct repost_payload {
+ struct tw_entry tw;
+ int fired;
+ struct payload * sibling;
+ uint64_t repost_at;
+};
+
+static void cb_count(void * arg)
+{
+ struct payload * p = arg;
+ p->fired++;
+}
+
+static void cb_cancel_sibling(void * arg)
+{
+ struct cancel_payload * p = arg;
+ p->fired++;
+ tw_cancel(p->sibling);
+}
+
+static void cb_repost_sibling(void * arg)
+{
+ struct repost_payload * p = arg;
+ p->fired++;
+ tw_post(&p->sibling->tw, p->repost_at, cb_count, p->sibling);
+}
+
+static uint64_t now_ns(void)
+{
+ struct timespec ts;
+ clock_gettime(PTHREAD_COND_CLOCK, &ts);
+ return TS_TO_UINT64(ts);
+}
+
+static void sleep_ns(uint64_t ns)
+{
+ struct timespec ts;
+ UINT64_TO_TS(ns, &ts);
+ nanosleep(&ts, NULL);
+}
+
+static int test_tw_init_fini(void)
+{
+ TEST_START();
+
+ if (tw_init() < 0) {
+ printf("tw_init failed.\n");
+ goto fail;
+ }
+
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+static int test_tw_post_fires_after_deadline(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 5 * MILLION, cb_count, &p);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("expected 1 fire, got %d\n", p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+static int test_tw_no_fire_before_deadline(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 100 * MILLION, cb_count, &p);
+
+ sleep_ns(2 * MILLION);
+ tw_move();
+
+ if (p.fired != 0) {
+ printf("expected 0 fires, got %d\n", p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+static int test_tw_cancel_prevents_fire(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 5 * MILLION, cb_count, &p);
+ tw_cancel(&p.tw);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (p.fired != 0) {
+ printf("cancelled entry fired %d times\n", p.fired);
+ goto fail_init;
+ }
+
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_init:
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+static int test_tw_cancel_unposted_is_noop(void)
+{
+ struct tw_entry e;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&e);
+ tw_cancel(&e);
+ tw_cancel(&e);
+
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+static int test_tw_fire_only_once(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+ tw_move();
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("expected 1 fire, got %d after 3 moves\n", p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Multi-level: post a level-1 (>= 256ms) deadline; should still fire. */
+static int test_tw_post_level1_fires(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 300 * MILLION, cb_count, &p);
+
+ if (p.tw.lvl != 1) {
+ printf("expected level 1 placement, got %zu\n", p.tw.lvl);
+ goto fail_post;
+ }
+
+ sleep_ns(320 * MILLION);
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("level-1 entry didn't fire (got %d)\n", p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+static int test_tw_many_entries_all_fire(void)
+{
+ struct payload pl[16];
+ size_t i;
+ size_t total = 0;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ for (i = 0; i < 16; ++i) {
+ tw_init_entry(&pl[i].tw);
+ pl[i].fired = 0;
+ tw_post(&pl[i].tw, now_ns() + (1 + i) * MILLION,
+ cb_count, &pl[i]);
+ }
+
+ sleep_ns(40 * MILLION);
+ tw_move();
+
+ for (i = 0; i < 16; ++i)
+ total += pl[i].fired;
+
+ if (total != 16) {
+ printf("expected 16 fires, got %zu\n", total);
+ goto fail_post;
+ }
+
+ for (i = 0; i < 16; ++i)
+ tw_cancel(&pl[i].tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ for (i = 0; i < 16; ++i)
+ tw_cancel(&pl[i].tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* tw_next_expiry signals empty wheel via tv_nsec == -1. */
+static int test_tw_next_expiry_empty(void)
+{
+ struct timespec out;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_next_expiry(&out);
+ if (out.tv_nsec != -1) {
+ printf("expected tv_nsec=-1, got %ld\n", (long) out.tv_nsec);
+ goto fail_init;
+ }
+
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_init:
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* tw_next_expiry returns a deadline within the right ballpark. */
+static int test_tw_next_expiry_returns_deadline(void)
+{
+ struct payload p;
+ struct timespec out;
+ uint64_t target;
+ uint64_t out_ns;
+ int64_t skew;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ target = now_ns() + 50 * MILLION;
+ tw_post(&p.tw, target, cb_count, &p);
+
+ tw_next_expiry(&out);
+ out_ns = TS_TO_UINT64(out);
+
+ /* Level-0 quantization gives ±1 slot of skew. */
+ skew = (int64_t)(out_ns) - (int64_t)(target);
+ if (skew < -2 * MILLION || skew > 4 * MILLION) {
+ printf("deadline not in -2..+4 ms, skew=%ld ns\n", (long) skew);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Repost: fire, then post again. */
+static int test_tw_repost_after_fire(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p);
+ sleep_ns(20 * MILLION);
+ tw_move();
+ if (p.fired != 1) {
+ printf("first fire missed\n");
+ goto fail_post;
+ }
+
+ tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p);
+ sleep_ns(20 * MILLION);
+ tw_move();
+ if (p.fired != 2) {
+ printf("second fire missed (fired=%d)\n", p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Double-post replaces the schedule; only the second fires. */
+static int test_tw_double_post_replaces(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 30 * MILLION, cb_count, &p);
+ tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("expected 1 fire after replace, got %d\n", p.fired);
+ goto fail_post;
+ }
+
+ sleep_ns(40 * MILLION);
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("first schedule fired after replace (got %d)\n",
+ p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Fire callback may safely cancel a sibling in the same slot. */
+static int test_tw_fire_cancels_sibling(void)
+{
+ struct cancel_payload a;
+ struct payload b;
+ uint64_t deadline;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&a.tw);
+ tw_init_entry(&b.tw);
+ a.fired = 0;
+ a.sibling = &b.tw;
+ b.fired = 0;
+
+ deadline = now_ns() + 3 * MILLION;
+ tw_post(&a.tw, deadline, cb_cancel_sibling, &a);
+ tw_post(&b.tw, deadline, cb_count, &b);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (a.fired != 1) {
+ printf("a expected 1 fire, got %d\n", a.fired);
+ goto fail_post;
+ }
+ if (b.fired != 0) {
+ printf("b should not have fired (got %d)\n", b.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&a.tw);
+ tw_cancel(&b.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&a.tw);
+ tw_cancel(&b.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Fire callback may safely repost a sibling to a future slot. */
+static int test_tw_fire_posts_sibling(void)
+{
+ struct repost_payload a;
+ struct payload b;
+ uint64_t deadline;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&a.tw);
+ tw_init_entry(&b.tw);
+ a.fired = 0;
+ a.sibling = &b;
+ a.repost_at = now_ns() + 30 * MILLION;
+ b.fired = 0;
+
+ deadline = now_ns() + 3 * MILLION;
+ tw_post(&a.tw, deadline, cb_repost_sibling, &a);
+ tw_post(&b.tw, deadline, cb_count, &b);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (a.fired != 1) {
+ printf("a expected 1 fire, got %d\n", a.fired);
+ goto fail_post;
+ }
+ if (b.fired != 0) {
+ printf("b fired before reposted deadline (got %d)\n",
+ b.fired);
+ goto fail_post;
+ }
+
+ sleep_ns(25 * MILLION);
+ tw_move();
+
+ if (b.fired != 1) {
+ printf("b expected 1 fire after repost, got %d\n",
+ b.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&a.tw);
+ tw_cancel(&b.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&a.tw);
+ tw_cancel(&b.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+int tw_test(int argc,
+ char ** argv)
+{
+ int ret = 0;
+
+ (void) argc;
+ (void) argv;
+
+ ret |= test_tw_init_fini();
+ ret |= test_tw_post_fires_after_deadline();
+ ret |= test_tw_no_fire_before_deadline();
+ ret |= test_tw_cancel_prevents_fire();
+ ret |= test_tw_cancel_unposted_is_noop();
+ ret |= test_tw_fire_only_once();
+ ret |= test_tw_post_level1_fires();
+ ret |= test_tw_many_entries_all_fire();
+ ret |= test_tw_next_expiry_empty();
+ ret |= test_tw_next_expiry_returns_deadline();
+ ret |= test_tw_repost_after_fire();
+ ret |= test_tw_double_post_replaces();
+ ret |= test_tw_fire_cancels_sibling();
+ ret |= test_tw_fire_posts_sibling();
+
+ return ret;
+}
diff --git a/src/lib/timerwheel.c b/src/lib/timerwheel.c
deleted file mode 100644
index 2c796c96..00000000
--- a/src/lib/timerwheel.c
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- * Ouroboros - Copyright (C) 2016 - 2026
- *
- * Timerwheel
- *
- * Dimitri Staessens <dimitri@ouroboros.rocks>
- * Sander Vrijders <sander@ouroboros.rocks>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * version 2.1 as published by the Free Software Foundation.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., http://www.fsf.org/about/contact/.
- */
-
-#include <ouroboros/list.h>
-
-/* Overflow limits range to about 6 hours. */
-#define ts_to_ns(ts) (ts.tv_sec * BILLION + ts.tv_nsec)
-#define ts_to_rxm_slot(ts) (ts_to_ns(ts) >> RXMQ_RES)
-#define ts_to_ack_slot(ts) (ts_to_ns(ts) >> ACKQ_RES)
-
-struct rxm {
- struct list_head next;
- uint32_t seqno;
-#ifndef RXM_BUFFER_ON_HEAP
- struct ssm_pk_buff * spb;
-#endif
- struct frct_pci * pkt;
- size_t len;
- time_t t0; /* Time when original was sent (us). */
- struct frcti * frcti;
- int fd;
- int flow_id; /* Prevent rtx when fd reused. */
-};
-
-struct ack {
- struct list_head next;
- struct frcti * frcti;
- int fd;
- int flow_id;
-};
-
-struct {
- /*
- * At a 1 ms min resolution, every level bumps the
- * resolution by a factor of 16.
- */
- struct list_head rxms[RXMQ_LVLS][RXMQ_SLOTS];
-
- struct list_head acks[ACKQ_SLOTS];
- bool map[ACKQ_SLOTS][PROG_MAX_FLOWS];
-
- size_t prv_rxm[RXMQ_LVLS]; /* Last processed rxm slots. */
- size_t prv_ack; /* Last processed ack slot. */
- pthread_mutex_t lock;
-} rw;
-
-static void timerwheel_fini(void)
-{
- size_t i;
- size_t j;
- struct list_head * p;
- struct list_head * h;
-
- pthread_mutex_lock(&rw.lock);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- for (j = 0; j < RXMQ_SLOTS; j++) {
- list_for_each_safe(p, h, &rw.rxms[i][j]) {
- struct rxm * rxm;
- rxm = list_entry(p, struct rxm, next);
- list_del(&rxm->next);
-#ifdef RXM_BUFFER_ON_HEAP
- free(rxm->pkt);
-#else
- ssm_pk_buff_ack(rxm->spb);
- ipcp_spb_release(rxm->spb);
-#endif
- free(rxm);
- }
- }
- }
-
- for (i = 0; i < ACKQ_SLOTS; ++i) {
- list_for_each_safe(p, h, &rw.acks[i]) {
- struct ack * a = list_entry(p, struct ack, next);
- list_del(&a->next);
- free(a);
- }
- }
-
- pthread_mutex_unlock(&rw.lock);
-
- pthread_mutex_destroy(&rw.lock);
-}
-
-static int timerwheel_init(void)
-{
- struct timespec now;
- size_t i;
- size_t j;
-
- if (pthread_mutex_init(&rw.lock, NULL))
- return -1;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- rw.prv_rxm[i] = (ts_to_rxm_slot(now) - 1);
- rw.prv_rxm[i] >>= (RXMQ_BUMP * i);
- rw.prv_rxm[i] &= (RXMQ_SLOTS - 1);
- for (j = 0; j < RXMQ_SLOTS; ++j)
- list_head_init(&rw.rxms[i][j]);
- }
-
- rw.prv_ack = (ts_to_ack_slot(now) - 1) & (ACKQ_SLOTS - 1);
- for (i = 0; i < ACKQ_SLOTS; ++i)
- list_head_init(&rw.acks[i]);
-
- return 0;
-}
-
-static void timerwheel_move(void)
-{
- struct timespec now;
- struct list_head * p;
- struct list_head * h;
- size_t rxm_slot;
- size_t ack_slot;
- size_t i;
- size_t j;
-
- pthread_mutex_lock(&rw.lock);
-
- pthread_cleanup_push(__cleanup_mutex_unlock, &rw.lock);
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- rxm_slot = ts_to_rxm_slot(now);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- size_t j_max_slot = rxm_slot & (RXMQ_SLOTS - 1);
- j = rw.prv_rxm[i];
- if (j_max_slot < j)
- j_max_slot += RXMQ_SLOTS;
- while (j++ < j_max_slot) {
- list_for_each_safe(p, h,
- &rw.rxms[i][j & (RXMQ_SLOTS - 1)]) {
- struct rxm * r;
- struct frct_cr * snd_cr;
- struct frct_cr * rcv_cr;
- size_t slot;
- size_t rslot;
- ssize_t idx;
- struct ssm_pk_buff * spb;
- struct frct_pci * pci;
- struct flow * f;
- uint32_t snd_lwe;
- uint32_t rcv_lwe;
- size_t lvl = 0;
-
- r = list_entry(p, struct rxm, next);
-
- list_del(&r->next);
-
- snd_cr = &r->frcti->snd_cr;
- rcv_cr = &r->frcti->rcv_cr;
- f = &proc.flows[r->fd];
-#ifndef RXM_BUFFER_ON_HEAP
- ssm_pk_buff_ack(r->spb);
-#endif
- if (f->frcti == NULL
- || f->info.id != r->flow_id)
- goto cleanup;
-
- pthread_rwlock_rdlock(&r->frcti->lock);
-
- snd_lwe = snd_cr->lwe;
- rcv_lwe = rcv_cr->lwe;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- /* Has been ack'd, remove. */
- if (before(r->seqno, snd_lwe))
- goto cleanup;
-
- /* Check for r-timer expiry. */
- if (ts_to_ns(now) - r->t0 > r->frcti->r)
- goto flow_down;
-
- pthread_rwlock_wrlock(&r->frcti->lock);
-
- if (r->seqno == r->frcti->rttseq) {
- r->frcti->rto +=
- r->frcti->rto >> RTO_DIV;
- r->frcti->probe = false;
- }
-#ifdef PROC_FLOW_STATS
- r->frcti->n_rtx++;
-#endif
- rslot = r->frcti->rto >> RXMQ_RES;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- /* Schedule at least in the next time slot. */
- slot = ts_to_ns(now) >> RXMQ_RES;
-
- while (rslot >= RXMQ_SLOTS) {
- ++lvl;
- rslot >>= RXMQ_BUMP;
- slot >>= RXMQ_BUMP;
- }
-
- if (lvl >= RXMQ_LVLS) /* Can't reschedule */
- goto flow_down;
-
- rslot = (rslot + slot + 1) & (RXMQ_SLOTS - 1);
-#ifdef RXM_BLOCKING
- if (ipcp_spb_reserve(&spb, r->len) < 0)
-#else
- if (ssm_pool_alloc(proc.pool, r->len, NULL,
- &spb) < 0)
-#endif
- goto reschedule; /* rdrbuff full */
-
- pci = (struct frct_pci *) ssm_pk_buff_head(spb);
- memcpy(pci, r->pkt, r->len);
-#ifndef RXM_BUFFER_ON_HEAP
- ipcp_spb_release(r->spb);
- r->spb = spb;
- r->pkt = pci;
- ssm_pk_buff_wait_ack(spb);
-#endif
- idx = ssm_pk_buff_get_idx(spb);
-
- /* Retransmit the copy. */
- pci->ackno = hton32(rcv_lwe);
-#ifdef RXM_BLOCKING
- if (ssm_rbuff_write_b(f->tx_rb, idx, NULL) < 0)
-#else
- if (ssm_rbuff_write(f->tx_rb, idx) < 0)
-#endif
- goto flow_down;
- ssm_flow_set_notify(f->set, f->info.id,
- FLOW_PKT);
- reschedule:
- list_add(&r->next, &rw.rxms[lvl][rslot]);
- continue;
-
- flow_down:
- ssm_rbuff_set_acl(f->tx_rb, ACL_FLOWDOWN);
- ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWDOWN);
- cleanup:
-#ifdef RXM_BUFFER_ON_HEAP
- free(r->pkt);
-#else
- ipcp_spb_release(r->spb);
-#endif
- free(r);
- }
- }
- rw.prv_rxm[i] = rxm_slot & (RXMQ_SLOTS - 1);
- /* Move up a level in the wheel. */
- rxm_slot >>= RXMQ_BUMP;
- }
-
- ack_slot = ts_to_ack_slot(now) & (ACKQ_SLOTS - 1) ;
-
- j = rw.prv_ack;
-
- if (ack_slot < j)
- ack_slot += ACKQ_SLOTS;
-
- while (j++ < ack_slot) {
- list_for_each_safe(p, h, &rw.acks[j & (ACKQ_SLOTS - 1)]) {
- struct ack * a;
- struct flow * f;
-
- a = list_entry(p, struct ack, next);
-
- list_del(&a->next);
-
- f = &proc.flows[a->fd];
-
- rw.map[j & (ACKQ_SLOTS - 1)][a->fd] = false;
-
- if (f->info.id == a->flow_id && f->frcti != NULL)
- send_frct_pkt(a->frcti);
-
- free(a);
- }
- }
-
- rw.prv_ack = ack_slot & (ACKQ_SLOTS - 1);
-
- pthread_cleanup_pop(true);
-}
-
-static int timerwheel_rxm(struct frcti * frcti,
- uint32_t seqno,
- struct ssm_pk_buff * spb)
-{
- struct timespec now;
- struct rxm * r;
- size_t slot;
- size_t lvl = 0;
- time_t rto_slot;
-
- r = malloc(sizeof(*r));
- if (r == NULL)
- return -ENOMEM;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- r->t0 = ts_to_ns(now);
- r->seqno = seqno;
- r->frcti = frcti;
- r->len = ssm_pk_buff_len(spb);
-#ifdef RXM_BUFFER_ON_HEAP
- r->pkt = malloc(r->len);
- if (r->pkt == NULL) {
- free(r);
- return -ENOMEM;
- }
- memcpy(r->pkt, ssm_pk_buff_head(spb), r->len);
-#else
- r->spb = spb;
- r->pkt = (struct frct_pci *) ssm_pk_buff_head(spb);
-#endif
- pthread_rwlock_rdlock(&r->frcti->lock);
-
- rto_slot = frcti->rto >> RXMQ_RES;
- slot = r->t0 >> RXMQ_RES;
-
- r->fd = frcti->fd;
- r->flow_id = proc.flows[r->fd].info.id;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- while (rto_slot >= RXMQ_SLOTS) {
- ++lvl;
- rto_slot >>= RXMQ_BUMP;
- slot >>= RXMQ_BUMP;
- }
-
- if (lvl >= RXMQ_LVLS) { /* Out of timerwheel range. */
-#ifdef RXM_BUFFER_ON_HEAP
- free(r->pkt);
-#endif
- free(r);
- return -EPERM;
- }
-
- slot = (slot + rto_slot + 1) & (RXMQ_SLOTS - 1);
-
- pthread_mutex_lock(&rw.lock);
-
- list_add_tail(&r->next, &rw.rxms[lvl][slot]);
-#ifndef RXM_BUFFER_ON_HEAP
- ssm_pk_buff_wait_ack(spb);
-#endif
- pthread_mutex_unlock(&rw.lock);
-
- return 0;
-}
-
-static int timerwheel_delayed_ack(int fd,
- struct frcti * frcti)
-{
- struct timespec now;
- struct ack * a;
- size_t slot;
-
- a = malloc(sizeof(*a));
- if (a == NULL)
- return -ENOMEM;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_rdlock(&frcti->lock);
-
- slot = (((ts_to_ns(now) + (TICTIME << 1)) >> ACKQ_RES) + 1)
- & (ACKQ_SLOTS - 1);
-
- pthread_rwlock_unlock(&frcti->lock);
-
- a->fd = fd;
- a->frcti = frcti;
- a->flow_id = proc.flows[fd].info.id;
-
- pthread_mutex_lock(&rw.lock);
-
- if (rw.map[slot][fd]) {
- pthread_mutex_unlock(&rw.lock);
- free(a);
- return 0;
- }
-
- rw.map[slot][fd] = true;
-
- list_add_tail(&a->next, &rw.acks[slot]);
-
- pthread_mutex_unlock(&rw.lock);
-
- return 0;
-}
diff --git a/src/lib/tw.c b/src/lib/tw.c
new file mode 100644
index 00000000..ccde7dd1
--- /dev/null
+++ b/src/lib/tw.c
@@ -0,0 +1,307 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Generic deadline-ordered callback queue (timing wheel)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#define _DEFAULT_SOURCE
+#else
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include "config.h"
+
+#include <ouroboros/list.h>
+#include <ouroboros/pthread.h>
+#include <ouroboros/time.h>
+#include <ouroboros/tw.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/* 3 levels × 256 slots, 1 ms / 16 ms / 256 ms per-slot resolution. */
+#define TW_LVLS 3
+#define TW_SLOTS 256
+#define TW_BUMP 4
+#define TW_RES 20 /* 2^20 ns ≈ 1 ms per slot at level 0. */
+
+#define TW_SLOT(x) ((x) & (TW_SLOTS - 1))
+
+static struct {
+ struct list_head levels[TW_LVLS][TW_SLOTS];
+ size_t prv[TW_LVLS];
+ pthread_mutex_t mtx;
+ pthread_mutex_t move_mtx;
+ bool initialised;
+} tw;
+
+static size_t tw_lvl_res(size_t lvl)
+{
+ return TW_RES + TW_BUMP * lvl;
+}
+
+/* Smallest level whose slot range covers the deadline. */
+static size_t tw_pick_lvl(uint64_t now_ns,
+ uint64_t deadline_ns)
+{
+ uint64_t delta;
+ size_t lvl;
+
+ delta = deadline_ns > now_ns ? deadline_ns - now_ns : 0;
+ lvl = 0;
+
+ while (lvl < TW_LVLS - 1 && (delta >> tw_lvl_res(lvl)) >= TW_SLOTS)
+ ++lvl;
+
+ return lvl;
+}
+
+static size_t tw_slot(uint64_t ns,
+ size_t lvl)
+{
+ return TW_SLOT(ns >> tw_lvl_res(lvl));
+}
+
+int tw_init(void)
+{
+ struct timespec now;
+ size_t i;
+ size_t j;
+
+ assert(!tw.initialised);
+
+ if (pthread_mutex_init(&tw.mtx, NULL))
+ goto fail_mtx;
+
+ if (pthread_mutex_init(&tw.move_mtx, NULL))
+ goto fail_move_mtx;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+ for (i = 0; i < TW_LVLS; ++i) {
+ tw.prv[i] = TW_SLOT(tw_slot(TS_TO_UINT64(now), i) - 1);
+ for (j = 0; j < TW_SLOTS; ++j)
+ list_head_init(&tw.levels[i][j]);
+ }
+
+ tw.initialised = true;
+
+ return 0;
+
+ fail_move_mtx:
+ pthread_mutex_destroy(&tw.mtx);
+ fail_mtx:
+ return -1;
+}
+
+void tw_fini(void)
+{
+ size_t i;
+ size_t j;
+
+ assert(tw.initialised);
+
+ for (i = 0; i < TW_LVLS; ++i) {
+ for (j = 0; j < TW_SLOTS; ++j)
+ assert(list_is_empty(&tw.levels[i][j]));
+ }
+
+ pthread_mutex_destroy(&tw.move_mtx);
+ pthread_mutex_destroy(&tw.mtx);
+
+ tw.initialised = false;
+}
+
+void tw_init_entry(struct tw_entry * e)
+{
+ list_head_init(&e->next);
+
+ e->deadline_ns = 0;
+ e->fire = NULL;
+ e->arg = NULL;
+ e->lvl = 0;
+}
+
+void tw_post(struct tw_entry * e,
+ uint64_t deadline_ns,
+ tw_fire_fn_t fire,
+ void * arg)
+{
+ struct timespec now;
+ size_t lvl;
+ size_t slot;
+
+ assert(tw.initialised);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+ lvl = tw_pick_lvl(TS_TO_UINT64(now), deadline_ns);
+ /* +1 so deadline <= slot_start; lands later in slot. */
+ slot = TW_SLOT(tw_slot(deadline_ns, lvl) + 1);
+
+ e->deadline_ns = deadline_ns;
+ e->fire = fire;
+ e->arg = arg;
+ e->lvl = lvl;
+
+ pthread_mutex_lock(&tw.mtx);
+
+ if (!list_is_empty(&e->next))
+ list_del(&e->next);
+
+ list_add_tail(&e->next, &tw.levels[lvl][slot]);
+
+ pthread_mutex_unlock(&tw.mtx);
+}
+
+void tw_cancel(struct tw_entry * e)
+{
+ if (e == NULL)
+ return;
+
+ assert(tw.initialised);
+
+ pthread_mutex_lock(&tw.mtx);
+
+ if (!list_is_empty(&e->next)) {
+ list_del(&e->next);
+ list_head_init(&e->next);
+ }
+
+ pthread_mutex_unlock(&tw.mtx);
+}
+
+void tw_move(void)
+{
+ struct timespec now;
+ struct list_head deferred;
+ struct list_head * p;
+ uint64_t now_ns;
+ size_t i;
+ size_t j;
+ size_t cur;
+
+ assert(tw.initialised);
+
+ if (pthread_mutex_trylock(&tw.move_mtx) != 0)
+ return;
+
+ pthread_cleanup_push(__cleanup_mutex_unlock, &tw.move_mtx);
+
+ pthread_mutex_lock(&tw.mtx);
+
+ pthread_cleanup_push(__cleanup_mutex_unlock, &tw.mtx);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ for (i = 0; i < TW_LVLS; ++i) {
+ cur = tw_slot(now_ns, i);
+
+ j = tw.prv[i];
+ if (cur < j)
+ cur += TW_SLOTS;
+
+ while (j++ < cur) {
+ size_t s = TW_SLOT(j);
+
+ /* Pop-front so fire may mutate any entry. */
+ list_head_init(&deferred);
+
+ while (!list_is_empty(&tw.levels[i][s])) {
+ struct tw_entry * e;
+ p = tw.levels[i][s].nxt;
+ e = list_entry(p, struct tw_entry, next);
+ list_del(&e->next);
+
+ if (e->deadline_ns > now_ns) {
+ list_add_tail(&e->next, &deferred);
+ continue;
+ }
+
+ pthread_mutex_unlock(&tw.mtx);
+ e->fire(e->arg);
+ pthread_mutex_lock(&tw.mtx);
+ }
+
+ while (!list_is_empty(&deferred)) {
+ p = deferred.nxt;
+ list_del(p);
+ list_add_tail(p, &tw.levels[i][s]);
+ }
+ }
+
+ tw.prv[i] = TW_SLOT(cur);
+ }
+
+ pthread_cleanup_pop(true); /* tw.mtx */
+ pthread_cleanup_pop(true); /* tw.move_mtx */
+}
+
+/* Earliest pending deadline at level lvl, INT64_MAX if level is empty. */
+static int64_t tw_lvl_earliest(size_t lvl,
+ uint64_t now_ns)
+{
+ size_t cur = tw_slot(now_ns, lvl);
+ size_t j;
+
+ for (j = 1; j <= TW_SLOTS; ++j) {
+ size_t s = TW_SLOT(cur + j);
+
+ if (list_is_empty(&tw.levels[lvl][s]))
+ continue;
+
+ return (int64_t)(now_ns + ((uint64_t) j << tw_lvl_res(lvl)));
+ }
+
+ return INT64_MAX;
+}
+
+void tw_next_expiry(struct timespec * out)
+{
+ struct timespec now;
+ uint64_t now_ns;
+ int64_t earliest = INT64_MAX;
+ size_t i;
+
+ assert(tw.initialised);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_mutex_lock(&tw.mtx);
+
+ for (i = 0; i < TW_LVLS; ++i) {
+ int64_t dl = tw_lvl_earliest(i, now_ns);
+ if (dl < earliest)
+ earliest = dl;
+ }
+
+ pthread_mutex_unlock(&tw.mtx);
+
+ if (earliest == INT64_MAX) {
+ /* Empty wheel: tv_nsec=-1 is an invalid normalised value. */
+ out->tv_sec = 0;
+ out->tv_nsec = -1;
+ } else {
+ UINT64_TO_TS((uint64_t) earliest, out);
+ }
+}