From f280f884021c6ccdc1ec6e488768e968d5e4fa60 Mon Sep 17 00:00:00 2001 From: randomizedcoder Date: Wed, 13 May 2026 09:29:14 -0700 Subject: [PATCH 1/4] io_uring package: ring lifecycle, codec, tests, benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foundation for the opt-in io_uring path. Replaces the abandoned pkg/io_uring/codec.go scaffolding (which referenced the older iceber/iouring-go library) with a thin wrapper around github.com/randomizedcoder/giouring. Key design points (documented in the plan file): - One Ring per Netlinker. Setup flags are SingleIssuer + DeferTaskrun + CoopTaskrun for the "lighter on the system" profile a periodic netlink poller wants. SQPOLL is explicitly NOT used (would burn a CPU per ring for a 1Hz workload). - 64-bit userdata layout: bits 63..56 Operation (uint8), bits 31..0 RequestID (uint32). NsID dropped — the per-Netlinker ring already implies the netns. - Probe at init: refuse to enable io_uring if the kernel doesn't support OpRecvmsg / OpSend / OpWritev (which corresponds to Linux 6.1+ given the chosen setup flags). - In-flight map pins pool buffers from submit until the CQE arrives. Capped at 2× SQ entries to blow up loudly if SQEs are submitted faster than CQEs are drained. - Writev iovec is heap-allocated (not stack), so the kernel can still read it when the CQE fires. Tests cover single/multi-record recv, single/batched send, writev for SOCK_STREAM unix framing, in-flight-cap enforcement, and teardown draining. socketpair(AF_UNIX, SOCK_DGRAM) gives us datagram boundaries without root. Benchmarks measure the syscall vs io_uring A/B for both directions across batch sizes 1/16/64/256. At batch=256 on a socketpair workload, io_uring shows ~3× fewer voluntary context switches per record than the syscall baseline — the headline "lighter on the system" win. Real syscall-count reduction will be more visible in the end-to-end netlink benchmark that lands with the Netlinker wiring. 
Three new XtcpConfig proto fields: bool io_uring, uint32 io_uring_recv_batch_size (default 64), uint32 io_uring_cqe_batch_size (default 128). Bindings regenerated via buf generate. Co-Authored-By: Claude Opus 4.7 --- dart/xtcp_config/v1/xtcp_config.pb.dart | 53 +++ dart/xtcp_config/v1/xtcp_config.pbjson.dart | 12 +- gen/xtcp_config/v1/xtcp_config.pb.cc | 177 ++++++--- gen/xtcp_config/v1/xtcp_config.pb.h | 152 ++++++-- gen/xtcp_config/v1/xtcp_config.pb.validate.cc | 53 ++- go.mod | 11 +- go.sum | 90 +---- pkg/io_uring/bench_test.go | 307 +++++++++++++++ pkg/io_uring/codec.go | 60 ++- pkg/io_uring/codec_test.go | 80 ++-- pkg/io_uring/ring.go | 366 ++++++++++++++++++ pkg/io_uring/ring_test.go | 360 +++++++++++++++++ pkg/xtcp_config/xtcp_config.pb.go | 48 ++- proto/xtcp_config/v1/xtcp_config.proto | 29 ++ python/xtcp_config/v1/xtcp_config_pb2.py | 22 +- python/xtcp_config/v1/xtcp_config_pb2.pyi | 10 +- xtcp_config/v1/xtcp_config.swagger.json | 14 + 17 files changed, 1605 insertions(+), 239 deletions(-) create mode 100644 pkg/io_uring/bench_test.go create mode 100644 pkg/io_uring/ring.go create mode 100644 pkg/io_uring/ring_test.go diff --git a/dart/xtcp_config/v1/xtcp_config.pb.dart b/dart/xtcp_config/v1/xtcp_config.pb.dart index 7fbeba2..06866a1 100644 --- a/dart/xtcp_config/v1/xtcp_config.pb.dart +++ b/dart/xtcp_config/v1/xtcp_config.pb.dart @@ -358,6 +358,9 @@ class XtcpConfig extends $pb.GeneratedMessage { $core.String? tag, $core.int? grpcPort, EnabledDeserializers? enabledDeserializers, + $core.bool? ioUring, + $core.int? ioUringRecvBatchSize, + $core.int? 
ioUringCqeBatchSize, }) { final $result = create(); if (nlTimeoutMilliseconds != null) { @@ -435,6 +438,15 @@ class XtcpConfig extends $pb.GeneratedMessage { if (enabledDeserializers != null) { $result.enabledDeserializers = enabledDeserializers; } + if (ioUring != null) { + $result.ioUring = ioUring; + } + if (ioUringRecvBatchSize != null) { + $result.ioUringRecvBatchSize = ioUringRecvBatchSize; + } + if (ioUringCqeBatchSize != null) { + $result.ioUringCqeBatchSize = ioUringCqeBatchSize; + } return $result; } XtcpConfig._() : super(); @@ -467,6 +479,9 @@ class XtcpConfig extends $pb.GeneratedMessage { ..aOS(180, _omitFieldNames ? '' : 'tag') ..a<$core.int>(190, _omitFieldNames ? '' : 'grpcPort', $pb.PbFieldType.OU3) ..aOM(200, _omitFieldNames ? '' : 'enabledDeserializers', subBuilder: EnabledDeserializers.create) + ..aOB(210, _omitFieldNames ? '' : 'ioUring') + ..a<$core.int>(211, _omitFieldNames ? '' : 'ioUringRecvBatchSize', $pb.PbFieldType.OU3) + ..a<$core.int>(212, _omitFieldNames ? '' : 'ioUringCqeBatchSize', $pb.PbFieldType.OU3) ..hasRequiredFields = false ; @@ -768,6 +783,44 @@ class XtcpConfig extends $pb.GeneratedMessage { void clearEnabledDeserializers() => clearField(200); @$pb.TagNumber(200) EnabledDeserializers ensureEnabledDeserializers() => $_ensure(24); + + /// When true, route netlink reads and raw-socket destination writes + /// through an io_uring ring per Netlinker. Requires Linux 6.1+. + /// Library-backed destinations (kafka, nsq, nats, valkey) ignore this + /// flag — they continue to use their own client sockets unchanged. + @$pb.TagNumber(210) + $core.bool get ioUring => $_getBF(25); + @$pb.TagNumber(210) + set ioUring($core.bool v) { $_setBool(25, v); } + @$pb.TagNumber(210) + $core.bool hasIoUring() => $_has(25); + @$pb.TagNumber(210) + void clearIoUring() => clearField(210); + + /// Number of recvmsg SQEs kept in flight per Netlinker ring. 
Higher + /// values reduce io_uring_enter syscalls per dump cycle on hosts with + /// many sockets, at the cost of more pinned buffers from packet pool. + /// Ignored unless io_uring=true. Default 64. + @$pb.TagNumber(211) + $core.int get ioUringRecvBatchSize => $_getIZ(26); + @$pb.TagNumber(211) + set ioUringRecvBatchSize($core.int v) { $_setUnsignedInt32(26, v); } + @$pb.TagNumber(211) + $core.bool hasIoUringRecvBatchSize() => $_has(26); + @$pb.TagNumber(211) + void clearIoUringRecvBatchSize() => clearField(211); + + /// Maximum CQEs reaped per PeekBatchCQE call. Larger batches amortise + /// userland loop overhead but increase scheduling latency for the + /// netlinker goroutine. Ignored unless io_uring=true. Default 128. + @$pb.TagNumber(212) + $core.int get ioUringCqeBatchSize => $_getIZ(27); + @$pb.TagNumber(212) + set ioUringCqeBatchSize($core.int v) { $_setUnsignedInt32(27, v); } + @$pb.TagNumber(212) + $core.bool hasIoUringCqeBatchSize() => $_has(27); + @$pb.TagNumber(212) + void clearIoUringCqeBatchSize() => clearField(212); } class EnabledDeserializers extends $pb.GeneratedMessage { diff --git a/dart/xtcp_config/v1/xtcp_config.pbjson.dart b/dart/xtcp_config/v1/xtcp_config.pbjson.dart index 765420a..183f051 100644 --- a/dart/xtcp_config/v1/xtcp_config.pbjson.dart +++ b/dart/xtcp_config/v1/xtcp_config.pbjson.dart @@ -122,6 +122,9 @@ const XtcpConfig$json = { {'1': 'tag', '3': 180, '4': 1, '5': 9, '8': {}, '10': 'tag'}, {'1': 'grpc_port', '3': 190, '4': 1, '5': 13, '8': {}, '10': 'grpcPort'}, {'1': 'enabled_deserializers', '3': 200, '4': 1, '5': 11, '6': '.xtcp_config.v1.EnabledDeserializers', '8': {}, '10': 'enabledDeserializers'}, + {'1': 'io_uring', '3': 210, '4': 1, '5': 8, '8': {}, '10': 'ioUring'}, + {'1': 'io_uring_recv_batch_size', '3': 211, '4': 1, '5': 13, '8': {}, '10': 'ioUringRecvBatchSize'}, + {'1': 'io_uring_cqe_batch_size', '3': 212, '4': 1, '5': 13, '8': {}, '10': 'ioUringCqeBatchSize'}, ], '7': {}, }; @@ -153,9 +156,12 @@ final 
$typed_data.Uint8List xtcpConfigDescriptor = $convert.base64Decode( 'qgEgASgJQgq6SAfIAQByAhgoUgVsYWJlbBIdCgN0YWcYtAEgASgJQgq6SAfIAQByAhgoUgN0YW' 'cSLAoJZ3JwY19wb3J0GL4BIAEoDUIOukgLyAEBKgYY//8DKAFSCGdycGNQb3J0EmIKFWVuYWJs' 'ZWRfZGVzZXJpYWxpemVycxjIASABKAsyJC54dGNwX2NvbmZpZy52MS5FbmFibGVkRGVzZXJpYW' - 'xpemVyc0IGukgDyAEAUhRlbmFibGVkRGVzZXJpYWxpemVyczpzukhwGm4KD1h0Y3BDb25maWcu' - 'cG9sbBIyUG9sbCB0aW1lb3V0IG11c3QgYmUgbGVzcyB0aGFuIHBvbGwgcG9sbF9mcmVxdWVuY3' - 'kaJ3RoaXMucG9sbF9mcmVxdWVuY3kgPiB0aGlzLnBvbGxfdGltZW91dA=='); + 'xpemVyc0IGukgDyAEAUhRlbmFibGVkRGVzZXJpYWxpemVycxIiCghpb191cmluZxjSASABKAhC' + 'BrpIA8gBAFIHaW9VcmluZxJGChhpb191cmluZ19yZWN2X2JhdGNoX3NpemUY0wEgASgNQg26SA' + 'rIAQAqBRiAICgBUhRpb1VyaW5nUmVjdkJhdGNoU2l6ZRJEChdpb191cmluZ19jcWVfYmF0Y2hf' + 'c2l6ZRjUASABKA1CDbpICsgBACoFGIAgKAFSE2lvVXJpbmdDcWVCYXRjaFNpemU6c7pIcBpuCg' + '9YdGNwQ29uZmlnLnBvbGwSMlBvbGwgdGltZW91dCBtdXN0IGJlIGxlc3MgdGhhbiBwb2xsIHBv' + 'bGxfZnJlcXVlbmN5Gid0aGlzLnBvbGxfZnJlcXVlbmN5ID4gdGhpcy5wb2xsX3RpbWVvdXQ='); @$core.Deprecated('Use enabledDeserializersDescriptor instead') const EnabledDeserializers$json = { diff --git a/gen/xtcp_config/v1/xtcp_config.pb.cc b/gen/xtcp_config/v1/xtcp_config.pb.cc index 8e2c247..3d38ab8 100644 --- a/gen/xtcp_config/v1/xtcp_config.pb.cc +++ b/gen/xtcp_config/v1/xtcp_config.pb.cc @@ -152,12 +152,15 @@ inline constexpr XtcpConfig::Impl_::Impl_( packet_size_{::uint64_t{0u}}, nlmsg_seq_{0u}, packet_size_mply_{0u}, - write_files_{0u}, - protobuf_list_length_delimit_{false}, modulus_{::uint64_t{0u}}, + write_files_{0u}, dest_write_files_{0u}, debug_level_{0u}, - grpc_port_{0u} {} + protobuf_list_length_delimit_{false}, + io_uring_{false}, + grpc_port_{0u}, + io_uring_recv_batch_size_{0u}, + io_uring_cqe_batch_size_{0u} {} template PROTOBUF_CONSTEXPR XtcpConfig::XtcpConfig(::_pbi::ConstantInitialized) @@ -380,6 +383,9 @@ const ::uint32_t PROTOBUF_FIELD_OFFSET(::xtcp_config::v1::XtcpConfig, _impl_.tag_), PROTOBUF_FIELD_OFFSET(::xtcp_config::v1::XtcpConfig, 
_impl_.grpc_port_), PROTOBUF_FIELD_OFFSET(::xtcp_config::v1::XtcpConfig, _impl_.enabled_deserializers_), + PROTOBUF_FIELD_OFFSET(::xtcp_config::v1::XtcpConfig, _impl_.io_uring_), + PROTOBUF_FIELD_OFFSET(::xtcp_config::v1::XtcpConfig, _impl_.io_uring_recv_batch_size_), + PROTOBUF_FIELD_OFFSET(::xtcp_config::v1::XtcpConfig, _impl_.io_uring_cqe_batch_size_), ~0u, 0, 1, @@ -405,6 +411,9 @@ const ::uint32_t ~0u, ~0u, 3, + ~0u, + ~0u, + ~0u, PROTOBUF_FIELD_OFFSET(::xtcp_config::v1::EnabledDeserializers_EnabledEntry_DoNotUse, _impl_._has_bits_), PROTOBUF_FIELD_OFFSET(::xtcp_config::v1::EnabledDeserializers_EnabledEntry_DoNotUse, _internal_metadata_), ~0u, // no _extensions_ @@ -436,9 +445,9 @@ static const ::_pbi::MigrationSchema {28, 37, -1, sizeof(::xtcp_config::v1::SetResponse)}, {38, 48, -1, sizeof(::xtcp_config::v1::SetPollFrequencyRequest)}, {50, 59, -1, sizeof(::xtcp_config::v1::SetPollFrequencyResponse)}, - {60, 93, -1, sizeof(::xtcp_config::v1::XtcpConfig)}, - {118, 128, -1, sizeof(::xtcp_config::v1::EnabledDeserializers_EnabledEntry_DoNotUse)}, - {130, -1, -1, sizeof(::xtcp_config::v1::EnabledDeserializers)}, + {60, 96, -1, sizeof(::xtcp_config::v1::XtcpConfig)}, + {124, 134, -1, sizeof(::xtcp_config::v1::EnabledDeserializers_EnabledEntry_DoNotUse)}, + {136, -1, -1, sizeof(::xtcp_config::v1::EnabledDeserializers)}, }; static const ::_pb::Message* const file_default_instances[] = { &::xtcp_config::v1::_GetRequest_default_instance_._instance, @@ -471,7 +480,7 @@ const char descriptor_table_protodef_xtcp_5fconfig_2fv1_2fxtcp_5fconfig_2eproto[ " than poll poll_frequency\032\'this.poll_tim" "eout < this.poll_frequency\"N\n\030SetPollFre" "quencyResponse\0222\n\006config\030\001 \001(\0132\032.xtcp_co" - "nfig.v1.XtcpConfigR\006config\"\204\014\n\nXtcpConfi" + "nfig.v1.XtcpConfigR\006config\"\266\r\n\nXtcpConfi" "g\022F\n\027nl_timeout_milliseconds\030\n \001(\004B\016\272H\0132" "\006\030\240\215\006(\000\310\001\001R\025nlTimeoutMilliseconds\022S\n\016pol" 
"l_frequency\030\024 \001(\0132\031.google.protobuf.Dura" @@ -507,26 +516,31 @@ const char descriptor_table_protodef_xtcp_5fconfig_2fv1_2fxtcp_5fconfig_2eproto[ "\276\001 \001(\rB\016\272H\013*\006\030\377\377\003(\001\310\001\001R\010grpcPort\022b\n\025enab" "led_deserializers\030\310\001 \001(\0132$.xtcp_config.v" "1.EnabledDeserializersB\006\272H\003\310\001\000R\024enabledD" - "eserializers:s\272Hp\032n\n\017XtcpConfig.poll\0222Po" - "ll timeout must be less than poll poll_f" - "requency\032\'this.poll_frequency > this.pol" - "l_timeout\"\237\001\n\024EnabledDeserializers\022K\n\007en" - "abled\030\001 \003(\01321.xtcp_config.v1.EnabledDese" - "rializers.EnabledEntryR\007enabled\032:\n\014Enabl" - "edEntry\022\020\n\003key\030\001 \001(\tR\003key\022\024\n\005value\030\002 \001(\010" - "R\005value:\0028\0012\341\002\n\rConfigService\022]\n\003Get\022\032.x" - "tcp_config.v1.GetRequest\032\033.xtcp_config.v" - "1.GetResponse\"\035\202\323\344\223\002\027\032\022/ConfigService/Ge" - "t:\001*\022]\n\003Set\022\032.xtcp_config.v1.SetRequest\032" - "\033.xtcp_config.v1.SetResponse\"\035\202\323\344\223\002\027\032\022/C" - "onfigService/Set:\001*\022\221\001\n\020SetPollFrequency" - "\022\'.xtcp_config.v1.SetPollFrequencyReques" - "t\032(.xtcp_config.v1.SetPollFrequencyRespo" - "nse\"*\202\323\344\223\002$\032\037/ConfigService/SetPollFrequ" - "ency:\001*B\215\001\n\022com.xtcp_config.v1B\017XtcpConf" - "igProtoP\001Z\021./pkg/xtcp_config\242\002\003XXX\252\002\rXtc" - "pConfig.V1\312\002\rXtcpConfig\\V1\342\002\031XtcpConfig\\" - "V1\\GPBMetadata\352\002\016XtcpConfig::V1b\006proto3" + "eserializers\022\"\n\010io_uring\030\322\001 \001(\010B\006\272H\003\310\001\000R" + "\007ioUring\022F\n\030io_uring_recv_batch_size\030\323\001 " + "\001(\rB\r\272H\n*\005\030\200 (\001\310\001\000R\024ioUringRecvBatchSize" + "\022D\n\027io_uring_cqe_batch_size\030\324\001 \001(\rB\r\272H\n*" + "\005\030\200 (\001\310\001\000R\023ioUringCqeBatchSize:s\272Hp\032n\n\017X" + 
"tcpConfig.poll\0222Poll timeout must be les" + "s than poll poll_frequency\032\'this.poll_fr" + "equency > this.poll_timeout\"\237\001\n\024EnabledD" + "eserializers\022K\n\007enabled\030\001 \003(\01321.xtcp_con" + "fig.v1.EnabledDeserializers.EnabledEntry" + "R\007enabled\032:\n\014EnabledEntry\022\020\n\003key\030\001 \001(\tR\003" + "key\022\024\n\005value\030\002 \001(\010R\005value:\0028\0012\341\002\n\rConfig" + "Service\022]\n\003Get\022\032.xtcp_config.v1.GetReque" + "st\032\033.xtcp_config.v1.GetResponse\"\035\202\323\344\223\002\027\032" + "\022/ConfigService/Get:\001*\022]\n\003Set\022\032.xtcp_con" + "fig.v1.SetRequest\032\033.xtcp_config.v1.SetRe" + "sponse\"\035\202\323\344\223\002\027\032\022/ConfigService/Set:\001*\022\221\001" + "\n\020SetPollFrequency\022\'.xtcp_config.v1.SetP" + "ollFrequencyRequest\032(.xtcp_config.v1.Set" + "PollFrequencyResponse\"*\202\323\344\223\002$\032\037/ConfigSe" + "rvice/SetPollFrequency:\001*B\215\001\n\022com.xtcp_c" + "onfig.v1B\017XtcpConfigProtoP\001Z\021./pkg/xtcp_" + "config\242\002\003XXX\252\002\rXtcpConfig.V1\312\002\rXtcpConfi" + "g\\V1\342\002\031XtcpConfig\\V1\\GPBMetadata\352\002\016XtcpC" + "onfig::V1b\006proto3" }; static const ::_pbi::DescriptorTable* const descriptor_table_xtcp_5fconfig_2fv1_2fxtcp_5fconfig_2eproto_deps[3] = { @@ -538,7 +552,7 @@ static ::absl::once_flag descriptor_table_xtcp_5fconfig_2fv1_2fxtcp_5fconfig_2ep PROTOBUF_CONSTINIT const ::_pbi::DescriptorTable descriptor_table_xtcp_5fconfig_2fv1_2fxtcp_5fconfig_2eproto = { false, false, - 2959, + 3137, descriptor_table_protodef_xtcp_5fconfig_2fv1_2fxtcp_5fconfig_2eproto, "xtcp_config/v1/xtcp_config.proto", &descriptor_table_xtcp_5fconfig_2fv1_2fxtcp_5fconfig_2eproto_once, @@ -2047,9 +2061,9 @@ XtcpConfig::XtcpConfig( offsetof(Impl_, nl_timeout_milliseconds_), reinterpret_cast(&from._impl_) + offsetof(Impl_, nl_timeout_milliseconds_), - offsetof(Impl_, grpc_port_) - + offsetof(Impl_, io_uring_cqe_batch_size_) - offsetof(Impl_, 
nl_timeout_milliseconds_) + - sizeof(Impl_::grpc_port_)); + sizeof(Impl_::io_uring_cqe_batch_size_)); // @@protoc_insertion_point(copy_constructor:xtcp_config.v1.XtcpConfig) } @@ -2071,9 +2085,9 @@ inline void XtcpConfig::SharedCtor(::_pb::Arena* arena) { ::memset(reinterpret_cast(&_impl_) + offsetof(Impl_, poll_frequency_), 0, - offsetof(Impl_, grpc_port_) - + offsetof(Impl_, io_uring_cqe_batch_size_) - offsetof(Impl_, poll_frequency_) + - sizeof(Impl_::grpc_port_)); + sizeof(Impl_::io_uring_cqe_batch_size_)); } XtcpConfig::~XtcpConfig() { // @@protoc_insertion_point(destructor:xtcp_config.v1.XtcpConfig) @@ -2134,15 +2148,15 @@ const ::google::protobuf::internal::ClassData* XtcpConfig::GetClassData() const return _class_data_.base(); } PROTOBUF_CONSTINIT PROTOBUF_ATTRIBUTE_INIT_PRIORITY1 -const ::_pbi::TcParseTable<5, 25, 4, 128, 27> XtcpConfig::_table_ = { +const ::_pbi::TcParseTable<5, 28, 4, 128, 27> XtcpConfig::_table_ = { { PROTOBUF_FIELD_OFFSET(XtcpConfig, _impl_._has_bits_), 0, // no _extensions_ - 200, 248, // max_field_number, fast_idx_mask + 212, 248, // max_field_number, fast_idx_mask offsetof(decltype(_table_), field_lookup_table), 3757571583, // skipmap offsetof(decltype(_table_), field_entries), - 25, // num_field_entries + 28, // num_field_entries 4, // num_aux_entries offsetof(decltype(_table_), aux_entries), _class_data_.base(), @@ -2215,7 +2229,7 @@ const ::_pbi::TcParseTable<5, 25, 4, 128, 27> XtcpConfig::_table_ = { }}, {{ 40, 0, 11, 62462, 3, 49135, 6, 65279, 8, 61435, 9, 65471, 11, 31740, 12, - 48495, 16, 65279, 20, 61435, 21, 65471, 23, 65534, 24, + 48495, 16, 65279, 20, 61435, 21, 65471, 23, 58366, 24, 65535, 65535 }}, {{ // uint64 nl_timeout_milliseconds = 10 [json_name = "nlTimeoutMilliseconds", (.buf.validate.field) = { @@ -2293,6 +2307,15 @@ const ::_pbi::TcParseTable<5, 25, 4, 128, 27> XtcpConfig::_table_ = { // .xtcp_config.v1.EnabledDeserializers enabled_deserializers = 200 [json_name = "enabledDeserializers", (.buf.validate.field) = 
{ {PROTOBUF_FIELD_OFFSET(XtcpConfig, _impl_.enabled_deserializers_), _Internal::kHasBitsOffset + 3, 3, (0 | ::_fl::kFcOptional | ::_fl::kMessage | ::_fl::kTvTable)}, + // bool io_uring = 210 [json_name = "ioUring", (.buf.validate.field) = { + {PROTOBUF_FIELD_OFFSET(XtcpConfig, _impl_.io_uring_), -1, 0, + (0 | ::_fl::kFcSingular | ::_fl::kBool)}, + // uint32 io_uring_recv_batch_size = 211 [json_name = "ioUringRecvBatchSize", (.buf.validate.field) = { + {PROTOBUF_FIELD_OFFSET(XtcpConfig, _impl_.io_uring_recv_batch_size_), -1, 0, + (0 | ::_fl::kFcSingular | ::_fl::kUInt32)}, + // uint32 io_uring_cqe_batch_size = 212 [json_name = "ioUringCqeBatchSize", (.buf.validate.field) = { + {PROTOBUF_FIELD_OFFSET(XtcpConfig, _impl_.io_uring_cqe_batch_size_), -1, 0, + (0 | ::_fl::kFcSingular | ::_fl::kUInt32)}, }}, {{ {::_pbi::TcParser::GetTable<::google::protobuf::Duration>()}, {::_pbi::TcParser::GetTable<::google::protobuf::Duration>()}, @@ -2347,8 +2370,8 @@ PROTOBUF_NOINLINE void XtcpConfig::Clear() { } } ::memset(&_impl_.nl_timeout_milliseconds_, 0, static_cast<::size_t>( - reinterpret_cast(&_impl_.grpc_port_) - - reinterpret_cast(&_impl_.nl_timeout_milliseconds_)) + sizeof(_impl_.grpc_port_)); + reinterpret_cast(&_impl_.io_uring_cqe_batch_size_) - + reinterpret_cast(&_impl_.nl_timeout_milliseconds_)) + sizeof(_impl_.io_uring_cqe_batch_size_)); _impl_._has_bits_.Clear(); _internal_metadata_.Clear<::google::protobuf::UnknownFieldSet>(); } @@ -2552,6 +2575,27 @@ PROTOBUF_NOINLINE void XtcpConfig::Clear() { stream); } + // bool io_uring = 210 [json_name = "ioUring", (.buf.validate.field) = { + if (this_._internal_io_uring() != 0) { + target = stream->EnsureSpace(target); + target = ::_pbi::WireFormatLite::WriteBoolToArray( + 210, this_._internal_io_uring(), target); + } + + // uint32 io_uring_recv_batch_size = 211 [json_name = "ioUringRecvBatchSize", (.buf.validate.field) = { + if (this_._internal_io_uring_recv_batch_size() != 0) { + target = stream->EnsureSpace(target); + 
target = ::_pbi::WireFormatLite::WriteUInt32ToArray( + 211, this_._internal_io_uring_recv_batch_size(), target); + } + + // uint32 io_uring_cqe_batch_size = 212 [json_name = "ioUringCqeBatchSize", (.buf.validate.field) = { + if (this_._internal_io_uring_cqe_batch_size() != 0) { + target = stream->EnsureSpace(target); + target = ::_pbi::WireFormatLite::WriteUInt32ToArray( + 212, this_._internal_io_uring_cqe_batch_size(), target); + } + if (PROTOBUF_PREDICT_FALSE(this_._internal_metadata_.have_unknown_fields())) { target = ::_pbi::WireFormat::InternalSerializeUnknownFieldsToArray( @@ -2677,20 +2721,16 @@ PROTOBUF_NOINLINE void XtcpConfig::Clear() { total_size += 2 + ::_pbi::WireFormatLite::UInt32Size( this_._internal_packet_size_mply()); } - // uint32 write_files = 90 [json_name = "writeFiles", (.buf.validate.field) = { - if (this_._internal_write_files() != 0) { - total_size += 2 + ::_pbi::WireFormatLite::UInt32Size( - this_._internal_write_files()); - } - // bool protobuf_list_length_delimit = 121 [json_name = "protobufListLengthDelimit", (.buf.validate.field) = { - if (this_._internal_protobuf_list_length_delimit() != 0) { - total_size += 3; - } // uint64 modulus = 110 [json_name = "modulus", (.buf.validate.field) = { if (this_._internal_modulus() != 0) { total_size += 2 + ::_pbi::WireFormatLite::UInt64Size( this_._internal_modulus()); } + // uint32 write_files = 90 [json_name = "writeFiles", (.buf.validate.field) = { + if (this_._internal_write_files() != 0) { + total_size += 2 + ::_pbi::WireFormatLite::UInt32Size( + this_._internal_write_files()); + } // uint32 dest_write_files = 135 [json_name = "destWriteFiles", (.buf.validate.field) = { if (this_._internal_dest_write_files() != 0) { total_size += 2 + ::_pbi::WireFormatLite::UInt32Size( @@ -2701,11 +2741,29 @@ PROTOBUF_NOINLINE void XtcpConfig::Clear() { total_size += 2 + ::_pbi::WireFormatLite::UInt32Size( this_._internal_debug_level()); } + // bool protobuf_list_length_delimit = 121 [json_name = 
"protobufListLengthDelimit", (.buf.validate.field) = { + if (this_._internal_protobuf_list_length_delimit() != 0) { + total_size += 3; + } + // bool io_uring = 210 [json_name = "ioUring", (.buf.validate.field) = { + if (this_._internal_io_uring() != 0) { + total_size += 3; + } // uint32 grpc_port = 190 [json_name = "grpcPort", (.buf.validate.field) = { if (this_._internal_grpc_port() != 0) { total_size += 2 + ::_pbi::WireFormatLite::UInt32Size( this_._internal_grpc_port()); } + // uint32 io_uring_recv_batch_size = 211 [json_name = "ioUringRecvBatchSize", (.buf.validate.field) = { + if (this_._internal_io_uring_recv_batch_size() != 0) { + total_size += 2 + ::_pbi::WireFormatLite::UInt32Size( + this_._internal_io_uring_recv_batch_size()); + } + // uint32 io_uring_cqe_batch_size = 212 [json_name = "ioUringCqeBatchSize", (.buf.validate.field) = { + if (this_._internal_io_uring_cqe_batch_size() != 0) { + total_size += 2 + ::_pbi::WireFormatLite::UInt32Size( + this_._internal_io_uring_cqe_batch_size()); + } } return this_.MaybeComputeUnknownFieldsSize(total_size, &this_._impl_._cached_size_); @@ -2804,24 +2862,33 @@ void XtcpConfig::MergeImpl(::google::protobuf::MessageLite& to_msg, const ::goog if (from._internal_packet_size_mply() != 0) { _this->_impl_.packet_size_mply_ = from._impl_.packet_size_mply_; } - if (from._internal_write_files() != 0) { - _this->_impl_.write_files_ = from._impl_.write_files_; - } - if (from._internal_protobuf_list_length_delimit() != 0) { - _this->_impl_.protobuf_list_length_delimit_ = from._impl_.protobuf_list_length_delimit_; - } if (from._internal_modulus() != 0) { _this->_impl_.modulus_ = from._impl_.modulus_; } + if (from._internal_write_files() != 0) { + _this->_impl_.write_files_ = from._impl_.write_files_; + } if (from._internal_dest_write_files() != 0) { _this->_impl_.dest_write_files_ = from._impl_.dest_write_files_; } if (from._internal_debug_level() != 0) { _this->_impl_.debug_level_ = from._impl_.debug_level_; } + if 
(from._internal_protobuf_list_length_delimit() != 0) { + _this->_impl_.protobuf_list_length_delimit_ = from._impl_.protobuf_list_length_delimit_; + } + if (from._internal_io_uring() != 0) { + _this->_impl_.io_uring_ = from._impl_.io_uring_; + } if (from._internal_grpc_port() != 0) { _this->_impl_.grpc_port_ = from._impl_.grpc_port_; } + if (from._internal_io_uring_recv_batch_size() != 0) { + _this->_impl_.io_uring_recv_batch_size_ = from._impl_.io_uring_recv_batch_size_; + } + if (from._internal_io_uring_cqe_batch_size() != 0) { + _this->_impl_.io_uring_cqe_batch_size_ = from._impl_.io_uring_cqe_batch_size_; + } _this->_impl_._has_bits_[0] |= cached_has_bits; _this->_internal_metadata_.MergeFrom<::google::protobuf::UnknownFieldSet>(from._internal_metadata_); } @@ -2849,8 +2916,8 @@ void XtcpConfig::InternalSwap(XtcpConfig* PROTOBUF_RESTRICT other) { ::_pbi::ArenaStringPtr::InternalSwap(&_impl_.label_, &other->_impl_.label_, arena); ::_pbi::ArenaStringPtr::InternalSwap(&_impl_.tag_, &other->_impl_.tag_, arena); ::google::protobuf::internal::memswap< - PROTOBUF_FIELD_OFFSET(XtcpConfig, _impl_.grpc_port_) - + sizeof(XtcpConfig::_impl_.grpc_port_) + PROTOBUF_FIELD_OFFSET(XtcpConfig, _impl_.io_uring_cqe_batch_size_) + + sizeof(XtcpConfig::_impl_.io_uring_cqe_batch_size_) - PROTOBUF_FIELD_OFFSET(XtcpConfig, _impl_.poll_frequency_)>( reinterpret_cast(&_impl_.poll_frequency_), reinterpret_cast(&other->_impl_.poll_frequency_)); diff --git a/gen/xtcp_config/v1/xtcp_config.pb.h b/gen/xtcp_config/v1/xtcp_config.pb.h index d18aed4..04d27df 100644 --- a/gen/xtcp_config/v1/xtcp_config.pb.h +++ b/gen/xtcp_config/v1/xtcp_config.pb.h @@ -862,12 +862,15 @@ class XtcpConfig final : public ::google::protobuf::Message kPacketSizeFieldNumber = 70, kNlmsgSeqFieldNumber = 60, kPacketSizeMplyFieldNumber = 80, - kWriteFilesFieldNumber = 90, - kProtobufListLengthDelimitFieldNumber = 121, kModulusFieldNumber = 110, + kWriteFilesFieldNumber = 90, kDestWriteFilesFieldNumber = 135, 
kDebugLevelFieldNumber = 160, + kProtobufListLengthDelimitFieldNumber = 121, + kIoUringFieldNumber = 210, kGrpcPortFieldNumber = 190, + kIoUringRecvBatchSizeFieldNumber = 211, + kIoUringCqeBatchSizeFieldNumber = 212, }; // string capture_path = 100 [json_name = "capturePath", (.buf.validate.field) = { void clear_capture_path() ; @@ -1126,26 +1129,6 @@ class XtcpConfig final : public ::google::protobuf::Message ::uint32_t _internal_packet_size_mply() const; void _internal_set_packet_size_mply(::uint32_t value); - public: - // uint32 write_files = 90 [json_name = "writeFiles", (.buf.validate.field) = { - void clear_write_files() ; - ::uint32_t write_files() const; - void set_write_files(::uint32_t value); - - private: - ::uint32_t _internal_write_files() const; - void _internal_set_write_files(::uint32_t value); - - public: - // bool protobuf_list_length_delimit = 121 [json_name = "protobufListLengthDelimit", (.buf.validate.field) = { - void clear_protobuf_list_length_delimit() ; - bool protobuf_list_length_delimit() const; - void set_protobuf_list_length_delimit(bool value); - - private: - bool _internal_protobuf_list_length_delimit() const; - void _internal_set_protobuf_list_length_delimit(bool value); - public: // uint64 modulus = 110 [json_name = "modulus", (.buf.validate.field) = { void clear_modulus() ; @@ -1156,6 +1139,16 @@ class XtcpConfig final : public ::google::protobuf::Message ::uint64_t _internal_modulus() const; void _internal_set_modulus(::uint64_t value); + public: + // uint32 write_files = 90 [json_name = "writeFiles", (.buf.validate.field) = { + void clear_write_files() ; + ::uint32_t write_files() const; + void set_write_files(::uint32_t value); + + private: + ::uint32_t _internal_write_files() const; + void _internal_set_write_files(::uint32_t value); + public: // uint32 dest_write_files = 135 [json_name = "destWriteFiles", (.buf.validate.field) = { void clear_dest_write_files() ; @@ -1176,6 +1169,26 @@ class XtcpConfig final : public 
::google::protobuf::Message ::uint32_t _internal_debug_level() const; void _internal_set_debug_level(::uint32_t value); + public: + // bool protobuf_list_length_delimit = 121 [json_name = "protobufListLengthDelimit", (.buf.validate.field) = { + void clear_protobuf_list_length_delimit() ; + bool protobuf_list_length_delimit() const; + void set_protobuf_list_length_delimit(bool value); + + private: + bool _internal_protobuf_list_length_delimit() const; + void _internal_set_protobuf_list_length_delimit(bool value); + + public: + // bool io_uring = 210 [json_name = "ioUring", (.buf.validate.field) = { + void clear_io_uring() ; + bool io_uring() const; + void set_io_uring(bool value); + + private: + bool _internal_io_uring() const; + void _internal_set_io_uring(bool value); + public: // uint32 grpc_port = 190 [json_name = "grpcPort", (.buf.validate.field) = { void clear_grpc_port() ; @@ -1186,13 +1199,33 @@ class XtcpConfig final : public ::google::protobuf::Message ::uint32_t _internal_grpc_port() const; void _internal_set_grpc_port(::uint32_t value); + public: + // uint32 io_uring_recv_batch_size = 211 [json_name = "ioUringRecvBatchSize", (.buf.validate.field) = { + void clear_io_uring_recv_batch_size() ; + ::uint32_t io_uring_recv_batch_size() const; + void set_io_uring_recv_batch_size(::uint32_t value); + + private: + ::uint32_t _internal_io_uring_recv_batch_size() const; + void _internal_set_io_uring_recv_batch_size(::uint32_t value); + + public: + // uint32 io_uring_cqe_batch_size = 212 [json_name = "ioUringCqeBatchSize", (.buf.validate.field) = { + void clear_io_uring_cqe_batch_size() ; + ::uint32_t io_uring_cqe_batch_size() const; + void set_io_uring_cqe_batch_size(::uint32_t value); + + private: + ::uint32_t _internal_io_uring_cqe_batch_size() const; + void _internal_set_io_uring_cqe_batch_size(::uint32_t value); + public: // @@protoc_insertion_point(class_scope:xtcp_config.v1.XtcpConfig) private: class _Internal; friend class 
::google::protobuf::internal::TcParser; static const ::google::protobuf::internal::TcParseTable< - 5, 25, 4, + 5, 28, 4, 128, 27> _table_; @@ -1231,12 +1264,15 @@ class XtcpConfig final : public ::google::protobuf::Message ::uint64_t packet_size_; ::uint32_t nlmsg_seq_; ::uint32_t packet_size_mply_; - ::uint32_t write_files_; - bool protobuf_list_length_delimit_; ::uint64_t modulus_; + ::uint32_t write_files_; ::uint32_t dest_write_files_; ::uint32_t debug_level_; + bool protobuf_list_length_delimit_; + bool io_uring_; ::uint32_t grpc_port_; + ::uint32_t io_uring_recv_batch_size_; + ::uint32_t io_uring_cqe_batch_size_; PROTOBUF_TSAN_DECLARE_MEMBER }; union { Impl_ _impl_; }; @@ -3672,6 +3708,72 @@ inline void XtcpConfig::set_allocated_enabled_deserializers(::xtcp_config::v1::E // @@protoc_insertion_point(field_set_allocated:xtcp_config.v1.XtcpConfig.enabled_deserializers) } +// bool io_uring = 210 [json_name = "ioUring", (.buf.validate.field) = { +inline void XtcpConfig::clear_io_uring() { + ::google::protobuf::internal::TSanWrite(&_impl_); + _impl_.io_uring_ = false; +} +inline bool XtcpConfig::io_uring() const { + // @@protoc_insertion_point(field_get:xtcp_config.v1.XtcpConfig.io_uring) + return _internal_io_uring(); +} +inline void XtcpConfig::set_io_uring(bool value) { + _internal_set_io_uring(value); + // @@protoc_insertion_point(field_set:xtcp_config.v1.XtcpConfig.io_uring) +} +inline bool XtcpConfig::_internal_io_uring() const { + ::google::protobuf::internal::TSanRead(&_impl_); + return _impl_.io_uring_; +} +inline void XtcpConfig::_internal_set_io_uring(bool value) { + ::google::protobuf::internal::TSanWrite(&_impl_); + _impl_.io_uring_ = value; +} + +// uint32 io_uring_recv_batch_size = 211 [json_name = "ioUringRecvBatchSize", (.buf.validate.field) = { +inline void XtcpConfig::clear_io_uring_recv_batch_size() { + ::google::protobuf::internal::TSanWrite(&_impl_); + _impl_.io_uring_recv_batch_size_ = 0u; +} +inline ::uint32_t 
XtcpConfig::io_uring_recv_batch_size() const { + // @@protoc_insertion_point(field_get:xtcp_config.v1.XtcpConfig.io_uring_recv_batch_size) + return _internal_io_uring_recv_batch_size(); +} +inline void XtcpConfig::set_io_uring_recv_batch_size(::uint32_t value) { + _internal_set_io_uring_recv_batch_size(value); + // @@protoc_insertion_point(field_set:xtcp_config.v1.XtcpConfig.io_uring_recv_batch_size) +} +inline ::uint32_t XtcpConfig::_internal_io_uring_recv_batch_size() const { + ::google::protobuf::internal::TSanRead(&_impl_); + return _impl_.io_uring_recv_batch_size_; +} +inline void XtcpConfig::_internal_set_io_uring_recv_batch_size(::uint32_t value) { + ::google::protobuf::internal::TSanWrite(&_impl_); + _impl_.io_uring_recv_batch_size_ = value; +} + +// uint32 io_uring_cqe_batch_size = 212 [json_name = "ioUringCqeBatchSize", (.buf.validate.field) = { +inline void XtcpConfig::clear_io_uring_cqe_batch_size() { + ::google::protobuf::internal::TSanWrite(&_impl_); + _impl_.io_uring_cqe_batch_size_ = 0u; +} +inline ::uint32_t XtcpConfig::io_uring_cqe_batch_size() const { + // @@protoc_insertion_point(field_get:xtcp_config.v1.XtcpConfig.io_uring_cqe_batch_size) + return _internal_io_uring_cqe_batch_size(); +} +inline void XtcpConfig::set_io_uring_cqe_batch_size(::uint32_t value) { + _internal_set_io_uring_cqe_batch_size(value); + // @@protoc_insertion_point(field_set:xtcp_config.v1.XtcpConfig.io_uring_cqe_batch_size) +} +inline ::uint32_t XtcpConfig::_internal_io_uring_cqe_batch_size() const { + ::google::protobuf::internal::TSanRead(&_impl_); + return _impl_.io_uring_cqe_batch_size_; +} +inline void XtcpConfig::_internal_set_io_uring_cqe_batch_size(::uint32_t value) { + ::google::protobuf::internal::TSanWrite(&_impl_); + _impl_.io_uring_cqe_batch_size_ = value; +} + // ------------------------------------------------------------------- // ------------------------------------------------------------------- diff --git a/gen/xtcp_config/v1/xtcp_config.pb.validate.cc 
b/gen/xtcp_config/v1/xtcp_config.pb.validate.cc index 52363da..567340f 100644 --- a/gen/xtcp_config/v1/xtcp_config.pb.validate.cc +++ b/gen/xtcp_config/v1/xtcp_config.pb.validate.cc @@ -747,6 +747,57 @@ return false; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -850,7 +901,7 @@ return false; } } - +// no validation rules for io_uring// no validation rules for io_uring_recv_batch_size// no validation rules for io_uring_cqe_batch_size return true; } diff --git a/go.mod b/go.mod index 06e3803..aa55e36 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/randomizedcoder/xtcp2 -go 1.24.1 +go 1.25 //replace ./pkg/xtcp_config => ./pkg/xtcp_config @@ -17,7 +17,7 @@ require ( github.com/twmb/franz-go/pkg/sr v1.3.0 github.com/twmb/franz-go/plugin/kprom v1.2.0 github.com/vmihailenco/msgpack/v5 v5.4.1 - golang.org/x/sys v0.32.0 + golang.org/x/sys v0.38.0 google.golang.org/genproto/googleapis/api v0.0.0-20250409194420-de1ac958c67a google.golang.org/grpc v1.71.1 google.golang.org/protobuf v1.36.6 @@ -43,16 +43,15 @@ require ( github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.63.0 // indirect github.com/prometheus/procfs v0.16.0 // indirect + github.com/randomizedcoder/giouring v0.0.0-00010101000000-000000000000 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect github.com/twmb/franz-go/pkg/kmsg v1.11.1 // indirect github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect - go.opentelemetry.io/otel v1.34.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect golang.org/x/crypto v0.37.0 // indirect golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0 // indirect golang.org/x/net v0.39.0 // indirect golang.org/x/text v0.24.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250409194420-de1ac958c67a // indirect - google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.5.1 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) + +replace 
github.com/randomizedcoder/giouring => /home/das/Downloads/giouring diff --git a/go.sum b/go.sum index 5c0689c..e31a2bf 100644 --- a/go.sum +++ b/go.sum @@ -1,11 +1,5 @@ -buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.1-20241127180247-a33202765966.1 h1:v223wh/bhlSHSc0tU9PXRWXHhkw3UWMtth7TmYGfHAQ= -buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.1-20241127180247-a33202765966.1/go.mod h1:/zlFuuECgFgewxwW6qQKgvMJ07YZkWlVkcSxEhJprJw= buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.6-20250307204501-0409229c3780.1 h1:zgJPqo17m28+Lf5BW4xv3PvU20BnrmTcGYrog22lLIU= buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.6-20250307204501-0409229c3780.1/go.mod h1:avRlCjnFzl98VPaeCtJ24RrV/wwHFzB8sWXhj26+n/U= -cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= -cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= -cel.dev/expr v0.22.1 h1:xoFEsNh972Yzey8N9TCPx2nDvMN7TMhQEzxLuj/iRrI= -cel.dev/expr v0.22.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= cel.dev/expr v0.23.1 h1:K4KOtPCJQjVggkARsjG9RWXP6O4R73aHeJMa/dmCQQg= cel.dev/expr v0.23.1/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= @@ -16,10 +10,6 @@ github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= -github.com/bufbuild/protovalidate-go v0.8.2 h1:sgzXHkHYP6HnAsL2Rd3I1JxkYUyEQUv9awU1PduMxbM= -github.com/bufbuild/protovalidate-go v0.8.2/go.mod h1:K6w8iPNAXBoIivVueSELbUeUl+MmeTQfCDSug85pn3M= -github.com/bufbuild/protovalidate-go v0.9.2 h1:dUoPvFimovS74s3eeFNvHQOxFumRPsk390ifkzJCJ/4= -github.com/bufbuild/protovalidate-go v0.9.2/go.mod 
h1:U9+WHAa6IOrLuqQEWPcxsyE4QEOTwm9fDpVbWXsR0zU= github.com/bufbuild/protovalidate-go v0.9.3 h1:XvdtwQuppS3wjzGfpOirsqwN5ExH2+PiIuA/XZd3MTM= github.com/bufbuild/protovalidate-go v0.9.3/go.mod h1:2lUDP6fNd3wxznRNH3Nj64VB07+PySeslamkerwP6tE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -38,9 +28,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= -github.com/envoyproxy/protoc-gen-validate v1.1.0 h1:tntQDh69XqOCOZsDz0lVJQez/2L6Uu2PdjCQwWCJ3bM= -github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4= github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= +github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw= github.com/felixge/fgprof v0.9.5 h1:8+vR6yu2vvSKn08urWyEuxx75NWPEvybbkBirEpsbVY= github.com/felixge/fgprof v0.9.5/go.mod h1:yKl+ERSa++RYOs32d8K6WEXCB4uXdLls4ZaZPpayhMM= @@ -56,22 +45,14 @@ github.com/gobwas/ws v1.2.1/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/K github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= -github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v1.0.0 
h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/cel-go v0.22.1 h1:AfVXx3chM2qwoSbM7Da8g8hX8OVSkBFwX+rz2+PcK40= -github.com/google/cel-go v0.22.1/go.mod h1:BuznPXXfQDpXKWQ9sPW3TzlAJN5zzFe+i9tIs0yC4s8= github.com/google/cel-go v0.24.1 h1:jsBCtxG8mM5wiUJDSGUqU0K7Mtr3w7Eyv00rw4DiZxI= github.com/google/cel-go v0.24.1/go.mod h1:Hdf9TqOaTNSFQA1ybQaRqATVoK7m/zcf7IMhGXP5zI8= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg= github.com/google/pprof v0.0.0-20240227163752-401108e1b7e7/go.mod h1:czg5+yv1E0ZGTi6S6vVK1mke0fV+FaUhNGcd6VRS9Ik= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= -github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= -github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -81,8 +62,6 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1ns github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w= github.com/ianlancetaylor/demangle v0.0.0-20230524184225-eabc099b10ab/go.mod h1:gx7rwoVhcfuVKG5uya9Hs3Sxj7EIvldVofAWIUtGouw= github.com/josharian/intern 
v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= -github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= -github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= @@ -91,14 +70,8 @@ github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/nats-io/nats.go v1.37.0 h1:07rauXbVnnJvv1gfIyghFEo6lUcYRY0WXc3x7x0vUxE= -github.com/nats-io/nats.go v1.37.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= -github.com/nats-io/nats.go v1.40.0 h1:qC3rnVZy15vJ15GSbB+pQtOmqo9q+65wnGVpvmcVv0Q= -github.com/nats-io/nats.go v1.40.0/go.mod h1:wV73x0FSI/orHPSYoyMeJB+KajMDoWyXmFaRrrYaaTo= github.com/nats-io/nats.go v1.41.1 h1:lCc/i5x7nqXbspxtmXaV4hRguMPHqE/kYltG9knrCdU= github.com/nats-io/nats.go v1.41.1/go.mod h1:mzHiutcAdZrg6WLfYVKXGseqqow2fWmwlTEUOHsI4jY= -github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI= -github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc= github.com/nats-io/nkeys v0.4.10 h1:glmRrpCmYLHByYcePvnTBEAwawwapjCPMjy2huw20wc= github.com/nats-io/nkeys v0.4.10/go.mod h1:OjRrnIKnWBFl+s4YK5ChQfvHP2fxqZexrKJoVVyWB3U= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= @@ -112,26 +85,14 @@ github.com/pkg/profile v1.7.0 h1:hnbDkaNWPCLMO9wGLdBFTIZvzDrDfBM2072E1S9gJkA= github.com/pkg/profile 
v1.7.0/go.mod h1:8Uer0jas47ZQMJ7VD+OHknK4YDY07LPUC6dEvqDjvNo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= -github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= -github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= -github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.61.0 h1:3gv/GThfX0cV2lpO7gkTUwZru38mxevy90Bj8YFSRQQ= -github.com/prometheus/common v0.61.0/go.mod h1:zr29OCN/2BsJRaFwG8QOBr41D6kkchKbpeNH7pAjb/s= github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k= github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/prometheus/procfs v0.16.0 h1:xh6oHhKwnOJKMYiYBDWmkHqQPyiY40sny36Cmx2bbsM= github.com/prometheus/procfs v0.16.0/go.mod h1:8veyXUu3nGP7oaCxhX6yeaM5u4stL2FeMXnCqhDthZg= -github.com/redis/go-redis/v9 v9.6.1 h1:HHDteefn6ZkTtY5fGUE8tj8uy85AHk6zP7CpzIAM0y4= 
-github.com/redis/go-redis/v9 v9.6.1/go.mod h1:0C0c6ycQsdpVNQpxb1njEQIqkx5UcsM8FJCQLgE9+RA= github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM= github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA= github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= @@ -144,18 +105,12 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/twmb/franz-go v1.18.0 h1:25FjMZfdozBywVX+5xrWC2W+W76i0xykKjTdEeD2ejw= -github.com/twmb/franz-go v1.18.0/go.mod h1:zXCGy74M0p5FbXsLeASdyvfLFsBvTubVqctIaa5wQ+I= github.com/twmb/franz-go v1.18.1 h1:D75xxCDyvTqBSiImFx2lkPduE39jz1vaD7+FNc+vMkc= github.com/twmb/franz-go v1.18.1/go.mod h1:Uzo77TarcLTUZeLuGq+9lNpSkfZI+JErv7YJhlDjs9M= -github.com/twmb/franz-go/pkg/kmsg v1.9.0 h1:JojYUph2TKAau6SBtErXpXGC7E3gg4vGZMv9xFU/B6M= -github.com/twmb/franz-go/pkg/kmsg v1.9.0/go.mod h1:CMbfazviCyY6HM0SXuG5t9vOwYDHRCSrJJyBAe5paqg= github.com/twmb/franz-go/pkg/kmsg v1.11.1 h1:cuW0wIrdZJQ8NZ5ba+jq0OIOdpP0yuRjPeuE8eYodZw= github.com/twmb/franz-go/pkg/kmsg v1.11.1/go.mod h1:CFfkkLysDNmukPYhGzuUcDtf46gQSqCZHMW1T4Z+wDE= github.com/twmb/franz-go/pkg/sr v1.3.0 h1:UlXpZ2suGgylzQBUb6Wn1jzqVShoPGzt7BbixznJ4qo= github.com/twmb/franz-go/pkg/sr v1.3.0/go.mod h1:gpd2Xl5/prkj3gyugcL+rVzagjaxFqMgvKMYcUlrpDw= -github.com/twmb/franz-go/plugin/kprom v1.1.0 h1:grGeIJbm4llUBF8jkDjTb/b8rKllWSXjMwIqeCCcNYQ= -github.com/twmb/franz-go/plugin/kprom v1.1.0/go.mod h1:cTDrPMSkyrO99LyGx3AtiwF9W6+THHjZrkDE2+TEBIU= github.com/twmb/franz-go/plugin/kprom v1.2.0 h1:BCl9Uj46cpniMfuqKA0IIHPgcx6syqEZ+H6MaQNSD4U= github.com/twmb/franz-go/plugin/kprom v1.2.0/go.mod 
h1:+dzpKnVE6By8BDRFj240dTDJS9bP2dngmuhv7egJ3Go= github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8= @@ -170,66 +125,29 @@ go.opentelemetry.io/otel/metric v1.34.0 h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE= go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.32.0 h1:rZvFnvmvawYb0alrYkjraqJq0Z4ZUJAiyYCU9snn1CU= -go.opentelemetry.io/otel/sdk/metric v1.32.0/go.mod h1:PWeZlq0zt9YkYAp3gjKZ0eicRYvOh1Gd+X99x6GHpCQ= go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= +go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= -golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= -golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= -golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= -golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= -golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67 h1:1UoZQm6f0P/ZO0w1Ri+f+ifG/gXhegadRdwBIXEFWDo= -golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67/go.mod h1:qj5a5QZpwLU2NLQudwIN5koi3beDhSAlJwa67PuM98c= -golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 h1:nDVHiLt8aIbd/VzvPWN6kSOPE7+F/fNFDSXLVYkE/Iw= -golang.org/x/exp v0.0.0-20250305212735-054e65f0b394/go.mod h1:sIifuuw/Yco/y6yb6+bDNfyeQ/MdPUy/hKEMYQV17cM= golang.org/x/exp 
v0.0.0-20250408133849-7e4ce0ab07d0 h1:R84qjqJb5nVJMxqWYb3np9L5ZsaDtB+a39EqjV0JSUM= golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0/go.mod h1:S9Xr4PYopiDyqSyp5NjCrhFrqg6A5zA2E/iPHPhqnS8= -golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= -golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= -golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c= -golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= -golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= -golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= golang.org/x/text v0.24.0/go.mod 
h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/api v0.0.0-20250313205543-e70fdf4c4cb4 h1:IFnXJq3UPB3oBREOodn1v1aGQeZYQclEmvWRMN0PSsY= -google.golang.org/genproto/googleapis/api v0.0.0-20250313205543-e70fdf4c4cb4/go.mod h1:c8q6Z6OCqnfVIqUFJkCzKcrj8eCvUrz+K4KRzSTuANg= google.golang.org/genproto/googleapis/api v0.0.0-20250409194420-de1ac958c67a h1:OQ7sHVzkx6L57dQpzUS4ckfWJ51KDH74XHTDe23xWAs= google.golang.org/genproto/googleapis/api v0.0.0-20250409194420-de1ac958c67a/go.mod h1:2R6XrVC8Oc08GlNh8ujEpc7HkLiEZ16QeY7FxIs20ac= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 h1:iK2jbkWL86DXjEx0qiHcRE9dE4/Ahua5k6V8OWFb//c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= google.golang.org/genproto/googleapis/rpc v0.0.0-20250409194420-de1ac958c67a h1:GIqLhp/cYUkuGuiT+vJk8vhOP86L4+SP5j8yXgeVpvI= google.golang.org/genproto/googleapis/rpc v0.0.0-20250409194420-de1ac958c67a/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.70.0 h1:pWFv03aZoHzlRKHWicjsZytKAiYCtNS0dHbXnIdq7jQ= -google.golang.org/grpc v1.70.0/go.mod h1:ofIJqVKDXx/JiXrwr2IG4/zwdH9txy3IlF40RmcJSQw= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= google.golang.org/grpc v1.71.1 
h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.5.1 h1:F29+wU6Ee6qgu9TddPgooOdaqsxTMunOoj8KA5yuS5A= -google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.5.1/go.mod h1:5KF+wpkbTSbGcR9zteSqZV6fqFOWBl4Yde8En8MryZA= -google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= -google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/pkg/io_uring/bench_test.go b/pkg/io_uring/bench_test.go new file mode 100644 index 0000000..96f28fd --- /dev/null +++ b/pkg/io_uring/bench_test.go @@ -0,0 +1,307 @@ +package io_uring + +import ( + "runtime" + "sync" + "syscall" + "testing" +) + +// payloadSize matches a small INET_DIAG response — large enough to be +// realistic, small enough to fit a high-fanout benchmark in a few seconds. +const payloadSize = 256 + +// recvBufSize is what each pool buffer is sized to — matches the default +// xtcp2 packet buffer (~32 KB), large enough for many netlink messages. +const recvBufSize = 32 * 1024 + +// rusageDelta captures user/system CPU time around the benchmark body. 
+type rusageDelta struct { + user int64 // microseconds + sys int64 // microseconds + maj int64 // major page faults (ru.Majflt); snapshotted but not reported by reportRusage + nvcs int64 // voluntary context switches (ru.Nvcsw) + nivs int64 // involuntary context switches (ru.Nivcsw) +} + +func snapshotRusage(b *testing.B) rusageDelta { + b.Helper() + var ru syscall.Rusage + if err := syscall.Getrusage(syscall.RUSAGE_SELF, &ru); err != nil { + b.Fatalf("Getrusage: %v", err) + } + return rusageDelta{ + user: ru.Utime.Sec*1e6 + int64(ru.Utime.Usec), + sys: ru.Stime.Sec*1e6 + int64(ru.Stime.Usec), + maj: ru.Majflt, + nvcs: ru.Nvcsw, + nivs: ru.Nivcsw, + } +} + +func reportRusage(b *testing.B, before, after rusageDelta) { + b.Helper() + div := float64(b.N) // every metric below is normalised per benchmark iteration + if div == 0 { + div = 1 + } + b.ReportMetric(float64(after.user-before.user)/div, "user_us/op") + b.ReportMetric(float64(after.sys-before.sys)/div, "sys_us/op") + b.ReportMetric(float64(after.nvcs-before.nvcs)/div, "nvcsw/op") + b.ReportMetric(float64(after.nivs-before.nivs)/div, "nivcsw/op") +} + +func makePayload() []byte { + p := make([]byte, payloadSize) + for i := range p { + p[i] = byte(i) // deterministic pattern: the index truncated to a byte + } + return p +} + +func newSendBufPool() *sync.Pool { + return &sync.Pool{New: func() any { + b := make([]byte, payloadSize) + return &b // pooled as *[]byte + }} +} + +func newRecvBufPool() *sync.Pool { + return &sync.Pool{New: func() any { + b := make([]byte, recvBufSize) + return &b // pooled as *[]byte + }} +} + +func drainerLoop(b *testing.B, fd int, stop <-chan struct{}) { + b.Helper() + go func() { // drainerLoop itself returns immediately; the draining happens in this goroutine + buf := make([]byte, recvBufSize) + for { + select { + case <-stop: + return + default: // stop not closed yet: fall through to the read + } + if _, err := syscall.Read(fd, buf); err != nil { // NOTE(review): stop is only re-checked after Read returns; presumably fd is blocking — confirm + return + } + } + }() +} + +// BenchmarkSyscallSend baseline: one syscall.Write per record. 
+func BenchmarkSyscallSend(b *testing.B) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + srv, cli := socketpair(b) + stop := make(chan struct{}) + defer close(stop) + drainerLoop(b, srv, stop) + + payload := makePayload() + b.SetBytes(int64(len(payload))) + b.ReportAllocs() + before := snapshotRusage(b) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + if _, err := syscall.Write(cli, payload); err != nil { + b.Fatalf("write: %v", err) + } + } + + b.StopTimer() + after := snapshotRusage(b) + reportRusage(b, before, after) +} + +// benchmarkIoUringSend pre-fills a window of `batch` send SQEs, submits +// them with one Submit call, drains `batch` CQEs, then refills — the +// realistic high-fanout pattern. The in-flight count stays bounded by +// `batch`, so we never hit the in-flight cap. +func benchmarkIoUringSend(b *testing.B, batch int) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if batch < 1 { + batch = 1 + } + r, err := New(Config{RecvBatchSize: batch, CQEBatchSize: batch}) + if err != nil { + b.Skipf("io_uring not available: %v", err) + } + defer r.Close(100_000_000, nil) + + srv, cli := socketpair(b) + stop := make(chan struct{}) + defer close(stop) + drainerLoop(b, srv, stop) + + pool := newSendBufPool() + payload := makePayload() + + b.SetBytes(int64(len(payload))) + b.ReportAllocs() + before := snapshotRusage(b) + b.ResetTimer() + + sent := 0 + for sent < b.N { + // Fill a window of `batch` sends (bounded by remaining work). + window := batch + if sent+window > b.N { + window = b.N - sent + } + for j := 0; j < window; j++ { + buf := pool.Get().(*[]byte) + copy(*buf, payload) + *buf = (*buf)[:len(payload)] + if _, err := r.EnqueueSend(cli, buf, OpSendUnixGram); err != nil { + b.Fatalf("EnqueueSend: %v", err) + } + } + if _, err := r.Submit(); err != nil { + b.Fatalf("Submit: %v", err) + } + // Drain the whole window before refilling. 
+ drained := 0 + for drained < window { + results, err := r.WaitOne() + if err != nil { + b.Fatalf("WaitOne: %v", err) + } + for _, res := range results { + if res.Buf != nil { + *res.Buf = (*res.Buf)[:cap(*res.Buf)] + pool.Put(res.Buf) + } + } + drained += len(results) + } + sent += window + } + + b.StopTimer() + after := snapshotRusage(b) + reportRusage(b, before, after) + b.ReportMetric(float64(batch), "batch") +} + +func BenchmarkIoUringSendBatch1(b *testing.B) { benchmarkIoUringSend(b, 1) } +func BenchmarkIoUringSendBatch16(b *testing.B) { benchmarkIoUringSend(b, 16) } +func BenchmarkIoUringSendBatch64(b *testing.B) { benchmarkIoUringSend(b, 64) } +func BenchmarkIoUringSendBatch256(b *testing.B) { benchmarkIoUringSend(b, 256) } + +// BenchmarkSyscallRecv baseline: one syscall.Recvfrom per record, using a +// single reused buffer (so allocs/op is zero, fair vs the io_uring path +// that uses a sync.Pool). +func BenchmarkSyscallRecv(b *testing.B) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + srv, cli := socketpair(b) + payload := makePayload() + + go func() { + for i := 0; i < b.N; i++ { + if _, err := syscall.Write(srv, payload); err != nil { + return + } + } + }() + + buf := make([]byte, recvBufSize) + b.SetBytes(int64(len(payload))) + b.ReportAllocs() + before := snapshotRusage(b) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + if _, _, err := syscall.Recvfrom(cli, buf, 0); err != nil { + b.Fatalf("Recvfrom: %v", err) + } + } + + b.StopTimer() + after := snapshotRusage(b) + reportRusage(b, before, after) +} + +// benchmarkIoUringRecv pre-fills a window of `batch` recv SQEs from a +// sync.Pool, drains them in a batch, returns buffers to the pool, and +// refills. Mirrors the design intent: many recvs per Submit/Drain syscall. 
+func benchmarkIoUringRecv(b *testing.B, batch int) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if batch < 1 { + batch = 1 + } + r, err := New(Config{RecvBatchSize: batch, CQEBatchSize: batch}) + if err != nil { + b.Skipf("io_uring not available: %v", err) + } + defer r.Close(100_000_000, nil) + + srv, cli := socketpair(b) + payload := makePayload() + + go func() { + for i := 0; i < b.N; i++ { + if _, err := syscall.Write(srv, payload); err != nil { + return + } + } + }() + + pool := newRecvBufPool() + + b.SetBytes(int64(len(payload))) + b.ReportAllocs() + before := snapshotRusage(b) + b.ResetTimer() + + processed := 0 + for processed < b.N { + window := batch + if processed+window > b.N { + window = b.N - processed + } + for j := 0; j < window; j++ { + buf := pool.Get().(*[]byte) + *buf = (*buf)[:recvBufSize] + if _, err := r.EnqueueRecvMsg(cli, buf); err != nil { + b.Fatalf("EnqueueRecvMsg: %v", err) + } + } + if _, err := r.Submit(); err != nil { + b.Fatalf("Submit: %v", err) + } + drained := 0 + for drained < window { + results, err := r.WaitOne() + if err != nil { + b.Fatalf("WaitOne: %v", err) + } + for _, res := range results { + if res.Buf != nil { + pool.Put(res.Buf) + } + } + drained += len(results) + } + processed += window + } + + b.StopTimer() + after := snapshotRusage(b) + reportRusage(b, before, after) + b.ReportMetric(float64(batch), "batch") +} + +func BenchmarkIoUringRecvBatch1(b *testing.B) { benchmarkIoUringRecv(b, 1) } +func BenchmarkIoUringRecvBatch16(b *testing.B) { benchmarkIoUringRecv(b, 16) } +func BenchmarkIoUringRecvBatch64(b *testing.B) { benchmarkIoUringRecv(b, 64) } +func BenchmarkIoUringRecvBatch256(b *testing.B) { benchmarkIoUringRecv(b, 256) } diff --git a/pkg/io_uring/codec.go b/pkg/io_uring/codec.go index 139776f..0d97e62 100644 --- a/pkg/io_uring/codec.go +++ b/pkg/io_uring/codec.go @@ -1,28 +1,52 @@ +// Package io_uring is the xtcp2-internal io_uring helper layer. 
It owns +// per-Netlinker ring lifecycle, the canonical 64-bit userdata encoding +// used to tag every SQE, and the buffer-ownership map that keeps pool +// buffers alive between submission and completion. +// +// Design rationale: see the commit message that introduced this package. The +// author's plan file (in-this-repo-there-starry-tiger.md) is not in this repo. package io_uring +// Operation tags every CQE so the netlinker goroutine can dispatch +// completions back to the right consumer without a side-channel lookup. +// +// Wire layout of the 64-bit userdata stamped on each SQE: +// +// bits 63..56 Operation (uint8) +// bits 55..32 reserved (24 bits) — must be zero +// bits 31..0 RequestID (uint32) — per-ring monotonic counter +// +// NsID is intentionally absent: the ring is per-Netlinker, so the netns +// is already implied by the goroutine that owns the ring. +type Operation uint8 + +const ( + // OpRead — a recvmsg SQE submitted against the netlink fd. + OpRead Operation = 0 + // OpSendUDP — a send SQE submitted against the udp dest fd. + OpSendUDP Operation = 1 + // OpSendUnix — a writev SQE (header + payload iovec) submitted + // against the unix-stream dest fd. + OpSendUnix Operation = 2 + // OpSendUnixGram — a send SQE submitted against the unixgram dest fd. + OpSendUnixGram Operation = 3 +) + +// EncodedRequest is the in-memory representation of a CQE userdata. type EncodedRequest struct { - Operation uint8 // Operation: "read=0", "write=1" - NsID uint16 // Network namespace ID - RequestID uint32 // Request ID + Operation Operation + RequestID uint32 } -// serialize converts an EncodedRequest into a uint64. -func serialize(req *EncodedRequest) uint64 { - var result uint64 - result |= uint64(req.Operation) << 56 // Store Operation in the highest 8 bits - result |= uint64(req.NsID) << 40 // Store NsID in the next 16 bits - result |= uint64(req.RequestID) & 0xFFFFFFFF // Store RequestID in the lowest 32 bits - return result +// serialize packs an EncodedRequest into a 64-bit userdata word. 
+func serialize(req EncodedRequest) uint64 { + return uint64(req.Operation)<<56 | uint64(req.RequestID) } -// deserialize converts a uint64 back into an EncodedRequest. +// deserialize unpacks a 64-bit userdata word back into an EncodedRequest. func deserialize(data uint64) EncodedRequest { - operation := uint8(data >> 56) // Extract the highest 8 bits - nsID := uint16((data >> 40) & 0xFFFF) // Extract the next 16 bits - requestID := uint32(data & 0xFFFFFFFF) // Extract the lowest 32 bits return EncodedRequest{ - Operation: operation, - NsID: nsID, - RequestID: requestID, + Operation: Operation(data >> 56), + RequestID: uint32(data), } -} \ No newline at end of file +} diff --git a/pkg/io_uring/codec_test.go b/pkg/io_uring/codec_test.go index 350e609..582152d 100644 --- a/pkg/io_uring/codec_test.go +++ b/pkg/io_uring/codec_test.go @@ -2,46 +2,64 @@ package io_uring import "testing" -func TestSerialization(t *testing.T) { - tests := []struct { - name string - request EncodedRequest - expected uint64 +func TestCodecRoundTrip(t *testing.T) { + cases := []struct { + name string + req EncodedRequest + want uint64 }{ { - name: "Test1", - request: EncodedRequest{ - Operation: 0, - NsID: 1, - RequestID: 100, - }, - expected: 0x0000010000000064, + name: "read_reqid_0", + req: EncodedRequest{Operation: OpRead, RequestID: 0}, + want: 0x0000_0000_0000_0000, }, { - name: "Test2", - request: EncodedRequest{ - Operation: 1, - NsID: 65535, - RequestID: 4294967295, - }, - //expected: 0x01ffffffffffffffff, - expected: 0x0000010000000064, + name: "read_reqid_mid", + req: EncodedRequest{Operation: OpRead, RequestID: 0x12345678}, + want: 0x0000_0000_1234_5678, + }, + { + name: "send_udp_reqid_1", + req: EncodedRequest{Operation: OpSendUDP, RequestID: 1}, + want: 0x0100_0000_0000_0001, + }, + { + name: "send_unix_reqid_max", + req: EncodedRequest{Operation: OpSendUnix, RequestID: 0xFFFFFFFF}, + want: 0x0200_0000_FFFF_FFFF, + }, + { + name: "send_unixgram_reqid_high_bit", + req: 
EncodedRequest{Operation: OpSendUnixGram, RequestID: 0x8000_0000}, + want: 0x0300_0000_8000_0000, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Test serialization - got := serialize(&tt.request) - if got != tt.expected { - t.Errorf("serialize() = 0x%016x, want 0x%016x", got, tt.expected) + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := serialize(c.req) + if got != c.want { + t.Errorf("serialize(%+v) = 0x%016x, want 0x%016x", c.req, got, c.want) } - - // Test deserialization - gotRequest := deserialize(got) - if gotRequest != tt.request { - t.Errorf("deserialize() = %+v, want %+v", gotRequest, tt.request) + back := deserialize(got) + if back != c.req { + t.Errorf("deserialize(0x%016x) = %+v, want %+v", got, back, c.req) } }) } } + +// TestCodecReservedBitsAreZero confirms the 24 reserved bits stay zero +// for every (Operation, RequestID) pair — if we later use them, this is +// the test to update first. +func TestCodecReservedBitsAreZero(t *testing.T) { + for op := Operation(0); op < 0xFF; op++ { + for _, rid := range []uint32{0, 1, 0x12345678, 0xFFFFFFFF} { + u := serialize(EncodedRequest{Operation: op, RequestID: rid}) + reserved := (u >> 32) & 0x00FF_FFFF + if reserved != 0 { + t.Fatalf("op=%d rid=0x%x: reserved bits non-zero: 0x%x", op, rid, reserved) + } + } + } +} diff --git a/pkg/io_uring/ring.go b/pkg/io_uring/ring.go new file mode 100644 index 0000000..785c2eb --- /dev/null +++ b/pkg/io_uring/ring.go @@ -0,0 +1,366 @@ +package io_uring + +import ( + "errors" + "fmt" + "sync" + "syscall" + "time" + "unsafe" + + "github.com/randomizedcoder/giouring" +) + +// Setup flags chosen for "lighter on the system" semantics on a periodic +// netlink workload: +// - SetupSingleIssuer : kernel skips locking on the SQ side (Linux 6.0+). +// - SetupDeferTaskrun : completion task work runs only when the owner +// calls Submit/Wait, instead of at IRQ time (Linux 6.1+). 
+// - SetupCoopTaskrun : no inter-processor interrupt on CQE completion; +// wake on the next WaitCQE naturally. +// +// Deliberately NOT using SetupSQPoll — a kernel poll thread burns one CPU +// per ring continuously, which is catastrophic for a 1Hz polling tool. +// Document for future readers: SQPoll is for storage-style sustained +// submit workloads, not periodic dumps. +const setupFlags uint32 = giouring.SetupSingleIssuer | + giouring.SetupDeferTaskrun | + giouring.SetupCoopTaskrun + +// Required opcodes — if any are missing we refuse to enable io_uring mode. +// Centralised here so the panic message and the probe check stay in sync. +var requiredOps = []uint8{ + giouring.OpRecvmsg, + giouring.OpSend, + giouring.OpWritev, +} + +// Config tunes the ring's queue depths. +type Config struct { + // RecvBatchSize is the number of recvmsg SQEs we keep in flight. + // Higher reduces syscalls per dump cycle on high-fanout hosts at the + // cost of pinned packet-pool buffers. + RecvBatchSize int + // CQEBatchSize bounds each PeekBatchCQE call. + CQEBatchSize int +} + +// Result is what a drainer hands back to the netlinker for one CQE. +type Result struct { + Op Operation + // Res is the kernel's CQE result: bytes transferred when positive, + // -errno when negative. + Res int32 + // Buf is the buffer the operation owned. For OpRead, it points at the + // packet pool buffer that received bytes (caller slices to Res and + // returns to packetBufferPool). For send/writev ops, it's the + // destBytesPool buffer that was just written (caller returns to pool). + Buf *[]byte + // HdrBytes is the small per-send header for OpSendUnix (varint prefix); + // the caller doesn't need to do anything with it — the Ring owns it. + HdrBytes []byte +} + +// inFlight tracks every SQE we've submitted but whose CQE hasn't arrived. +// The structs it points at (Msghdr, Iovec) must outlive submission, so we +// stash them here keyed by RequestID. 
+type inFlight struct { + op Operation + buf *[]byte + // For OpRead: the Msghdr and its single Iovec, kept alive for the + // kernel to fill. + msg *syscall.Msghdr + iov *syscall.Iovec + // For OpSendUnix: the two-element iovec array (header + payload) + // passed to writev. Both must remain valid until the CQE. + wvIov *[2]syscall.Iovec + wvHdr []byte // backing storage for the varint length header +} + +// Ring is xtcp2's per-Netlinker io_uring wrapper. It is NOT safe for +// concurrent use — every method must run on the goroutine that created +// it. +type Ring struct { + r *giouring.Ring + cfg Config + cqeBuf []*giouring.CompletionQueueEvent + nextReqID uint32 + inFlight map[uint32]inFlight + inFlightCap int +} + +// New creates a Ring sized for the given config. sqEntries is +// max(RecvBatchSize*2, 256) so that refills never spill the SQ during +// drain. Returns an error (it does not panic) if the probe shows that +// any required opcode is missing — caller opted into io_uring, silent +// fallback would hide the fault.
+func New(cfg Config) (*Ring, error) { + if cfg.RecvBatchSize < 1 { + return nil, errors.New("io_uring.New: RecvBatchSize must be >= 1") + } + if cfg.CQEBatchSize < 1 { + return nil, errors.New("io_uring.New: CQEBatchSize must be >= 1") + } + + if err := requireProbe(); err != nil { + return nil, err + } + + sqEntries := uint32(cfg.RecvBatchSize * 2) + if sqEntries < 256 { + sqEntries = 256 + } + + g := giouring.NewRing() + if err := g.QueueInit(sqEntries, setupFlags); err != nil { + return nil, fmt.Errorf("QueueInit(%d, flags=0x%x): %w", sqEntries, setupFlags, err) + } + + r := &Ring{ + r: g, + cfg: cfg, + cqeBuf: make([]*giouring.CompletionQueueEvent, cfg.CQEBatchSize), + inFlight: make(map[uint32]inFlight, sqEntries), + inFlightCap: int(sqEntries) * 2, // generous; refuse to leak unbounded + } + return r, nil +} + +// requireProbe asks the kernel which opcodes are supported and returns +// an error with a clear message if any of the ones we depend on are missing. +func requireProbe() error { + p, err := giouring.GetProbe() + if err != nil { + return fmt.Errorf("io_uring probe failed (kernel too old or io_uring disabled?): %w", err) + } + for _, op := range requiredOps { + if !p.IsSupported(op) { + return fmt.Errorf("io_uring opcode %d not supported by this kernel — need Linux 6.1+ for the configured setup flags (SingleIssuer+DeferTaskrun+CoopTaskrun)", op) + } + } + return nil +} + +// Close drains pending CQEs (best-effort, up to drainTimeout), passes each +// reaped Result to onDrain (entries still in flight when the loop stops are +// not delivered), then calls QueueExit on the ring. Safe to call multiple times. +func (r *Ring) Close(drainTimeout time.Duration, onDrain func(Result)) { + if r == nil || r.r == nil { + return + } + deadline := time.Now().Add(drainTimeout) + for time.Now().Before(deadline) && len(r.inFlight) > 0 { + // First reap anything already arrived (non-blocking). + results, _ := r.drainOnce() + if len(results) == 0 { + // Nothing yet — block for one CQE with a short timeout.
+ remaining := time.Until(deadline) + if remaining <= 0 { + break + } + step := remaining + if step > 50*time.Millisecond { + step = 50 * time.Millisecond + } + ts := syscall.NsecToTimespec(int64(step)) + if _, err := r.r.WaitCQETimeout(&ts); err != nil { + // ETIME (timeout) is expected; anything else stops us. + if !errors.Is(err, syscall.ETIME) && err.Error() != "errno 62" { + break + } + continue + } + results, _ = r.drainOnce() + } + if onDrain != nil { + for _, res := range results { + onDrain(res) + } + } + } + r.r.QueueExit() + r.r = nil +} + +// NextRequestID increments the per-ring uint32 counter and returns it; the first ID is 1 and the counter wraps on overflow. +func (r *Ring) NextRequestID() uint32 { + r.nextReqID++ + return r.nextReqID +} + +// EnqueueRecvMsg builds an SQE that asks the kernel to do recvmsg(fd, buf) +// when data is available. The buf and the supporting Msghdr/Iovec stay +// pinned in the in-flight map until the CQE arrives. Returns the +// RequestID stamped in the SQE userdata. +func (r *Ring) EnqueueRecvMsg(fd int, buf *[]byte) (uint32, error) { + if buf == nil || len(*buf) == 0 { + return 0, errors.New("io_uring.EnqueueRecvMsg: empty buffer") + } + if len(r.inFlight) >= r.inFlightCap { + return 0, fmt.Errorf("io_uring in-flight cap exceeded (%d) — SQEs submitted faster than CQEs drained", r.inFlightCap) + } + sqe := r.r.GetSQE() + if sqe == nil { + return 0, errors.New("io_uring.EnqueueRecvMsg: SQ full (GetSQE returned nil)") + } + + iov := &syscall.Iovec{Base: &(*buf)[0], Len: uint64(len(*buf))} + msg := &syscall.Msghdr{ + Iov: iov, + Iovlen: 1, + } + sqe.PrepareRecvMsg(fd, msg, 0) + reqID := r.NextRequestID() + sqe.SetData64(serialize(EncodedRequest{Operation: OpRead, RequestID: reqID})) + + r.inFlight[reqID] = inFlight{op: OpRead, buf: buf, msg: msg, iov: iov} + return reqID, nil +} + +// EnqueueSend builds a `send(2)` SQE. For UDP / unixgram destinations the +// kernel preserves the message boundary. Op is one of OpSendUDP or +// OpSendUnixGram.
+func (r *Ring) EnqueueSend(fd int, buf *[]byte, op Operation) (uint32, error) { + if buf == nil { + return 0, errors.New("io_uring.EnqueueSend: nil buffer") + } + if op != OpSendUDP && op != OpSendUnixGram { + return 0, fmt.Errorf("io_uring.EnqueueSend: unsupported op %d (want OpSendUDP or OpSendUnixGram)", op) + } + if len(r.inFlight) >= r.inFlightCap { + return 0, fmt.Errorf("io_uring in-flight cap exceeded (%d) — SQEs submitted faster than CQEs drained", r.inFlightCap) + } + sqe := r.r.GetSQE() + if sqe == nil { + return 0, errors.New("io_uring.EnqueueSend: SQ full (GetSQE returned nil)") + } + + addr := uintptr(0) + length := uint32(len(*buf)) + if length > 0 { + addr = uintptr(unsafe.Pointer(&(*buf)[0])) + } + sqe.PrepareSend(fd, addr, length, 0) + reqID := r.NextRequestID() + sqe.SetData64(serialize(EncodedRequest{Operation: op, RequestID: reqID})) + + r.inFlight[reqID] = inFlight{op: op, buf: buf} + return reqID, nil +} + +// EnqueueWritevUnix submits a 2-iovec writev to deliver a varint-prefixed +// frame (header + payload) atomically on a SOCK_STREAM unix socket. A +// copy of the header and the iovec array are stashed in the in-flight map; the +// payload buffer is borrowed from destBytesPool by the caller and +// returned to the pool on CQE reap. +func (r *Ring) EnqueueWritevUnix(fd int, header []byte, payload *[]byte) (uint32, error) { + if payload == nil { + return 0, errors.New("io_uring.EnqueueWritevUnix: nil payload") + } + if len(header) == 0 { + return 0, errors.New("io_uring.EnqueueWritevUnix: empty header") + } + if len(r.inFlight) >= r.inFlightCap { + return 0, fmt.Errorf("io_uring in-flight cap exceeded (%d)", r.inFlightCap) + } + sqe := r.r.GetSQE() + if sqe == nil { + return 0, errors.New("io_uring.EnqueueWritevUnix: SQ full") + } + + // Allocate iov with new and keep it in the in-flight map (wvIov) until + // the CQE arrives: the kernel is handed only a uintptr, which does not + // keep the array reachable for the garbage collector on its own.
+ iov := new([2]syscall.Iovec) + hdrCopy := make([]byte, len(header)) + copy(hdrCopy, header) + iov[0] = syscall.Iovec{Base: &hdrCopy[0], Len: uint64(len(hdrCopy))} + if len(*payload) > 0 { + iov[1] = syscall.Iovec{Base: &(*payload)[0], Len: uint64(len(*payload))} + } + + iovPtr := uintptr(unsafe.Pointer(&iov[0])) + sqe.PrepareWritev(fd, iovPtr, 2, 0) + reqID := r.NextRequestID() + sqe.SetData64(serialize(EncodedRequest{Operation: OpSendUnix, RequestID: reqID})) + + r.inFlight[reqID] = inFlight{ + op: OpSendUnix, + buf: payload, + wvIov: iov, + wvHdr: hdrCopy, + } + return reqID, nil +} + +// Submit flushes the SQ to the kernel in one syscall. +func (r *Ring) Submit() (int, error) { + n, err := r.r.Submit() + return int(n), err +} + +// SubmitAndWait flushes the SQ and waits for at least `waitNr` +// completions. Used by drain loops that want a single syscall round-trip. +func (r *Ring) SubmitAndWait(waitNr uint32) (int, error) { + n, err := r.r.SubmitAndWait(waitNr) + return int(n), err +} + +// DrainBatch reaps up to CQEBatchSize CQEs without blocking, decodes each +// back into a Result by looking up its in-flight entry, and returns a +// freshly allocated slice (nil when no CQE is ready). +func (r *Ring) DrainBatch() []Result { + results, _ := r.drainOnce() + return results +} + +func (r *Ring) drainOnce() ([]Result, int) { + n := r.r.PeekBatchCQE(r.cqeBuf) + if n == 0 { + return nil, 0 + } + out := make([]Result, 0, n) + for i := uint32(0); i < n; i++ { + cqe := r.cqeBuf[i] + req := deserialize(cqe.GetData64()) + entry, ok := r.inFlight[req.RequestID] + if ok { + delete(r.inFlight, req.RequestID) + } + out = append(out, Result{ + Op: req.Operation, + Res: cqe.Res, + Buf: entry.buf, + HdrBytes: entry.wvHdr, + }) + } + r.r.CQAdvance(n) + return out, int(n) +} + +// WaitOne blocks in WaitCQE (no timeout is passed) until at +// least one CQE is available, then returns the result of DrainBatch.
+func (r *Ring) WaitOne() ([]Result, error) { + if _, err := r.r.WaitCQE(); err != nil { + return nil, err + } + return r.DrainBatch(), nil +} + +// InFlightLen reports how many SQEs have been enqueued but not yet reaped — +// used by tests to assert clean teardown. +func (r *Ring) InFlightLen() int { + return len(r.inFlight) +} + +// SQReady returns the number of SQEs queued but not yet submitted to the +// kernel. Useful for tests / assertions. +func (r *Ring) SQReady() uint32 { + return r.r.SQReady() +} + +// Mutex-free contract reminder: a Ring is goroutine-bound. The blank +// sync.Mutex variable below is unused at runtime; it is a static signal to +// contributors that any future concurrent-access pattern needs to +// be designed around an explicit lock or move to one ring per goroutine. +var _ sync.Mutex diff --git a/pkg/io_uring/ring_test.go b/pkg/io_uring/ring_test.go new file mode 100644 index 0000000..bb8eed7 --- /dev/null +++ b/pkg/io_uring/ring_test.go @@ -0,0 +1,360 @@ +package io_uring + +import ( + "bytes" + "runtime" + "syscall" + "testing" + "time" +) + +// socketpair returns a pair of connected AF_UNIX SOCK_DGRAM fds. Datagram +// boundaries are preserved (unlike pipe(2)), which is exactly how netlink +// behaves, so tests using this pair mimic real netlink semantics. +func socketpair(t testing.TB) (int, int) { + t.Helper() + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) + if err != nil { + t.Fatalf("socketpair: %v", err) + } + t.Cleanup(func() { + _ = syscall.Close(fds[0]) + _ = syscall.Close(fds[1]) + }) + return fds[0], fds[1] +} + +func newTestRing(t testing.TB, recvBatch int) *Ring { + t.Helper() + if recvBatch < 1 { + recvBatch = 8 + } + r, err := New(Config{RecvBatchSize: recvBatch, CQEBatchSize: 32}) + if err != nil { + // Probe failure / kernel-too-old / io_uring disabled — skip so + // CI on older kernels doesn't fail the suite.
+ t.Skipf("io_uring not available on this kernel: %v", err) + } + t.Cleanup(func() { + r.Close(100*time.Millisecond, nil) + }) + return r +} + +// allocBuf returns a fresh *[]byte of size n for tests; mimics a pool +// borrow but without the pool plumbing. +func allocBuf(n int) *[]byte { + b := make([]byte, n) + return &b +} + +func TestRecvSingleDatagram(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + r := newTestRing(t, 4) + srv, cli := socketpair(t) + + // Submit one recv SQE before any data is on the wire. + buf := allocBuf(4096) + reqID, err := r.EnqueueRecvMsg(cli, buf) + if err != nil { + t.Fatalf("EnqueueRecvMsg: %v", err) + } + if _, err := r.Submit(); err != nil { + t.Fatalf("Submit: %v", err) + } + + payload := []byte("hello-netlink-shaped-bytes") + if _, err := syscall.Write(srv, payload); err != nil { + t.Fatalf("syscall.Write: %v", err) + } + + results, err := r.WaitOne() + if err != nil { + t.Fatalf("WaitOne: %v", err) + } + if len(results) != 1 { + t.Fatalf("got %d results, want 1", len(results)) + } + res := results[0] + if res.Op != OpRead { + t.Errorf("op=%d want OpRead", res.Op) + } + if res.Res != int32(len(payload)) { + t.Errorf("res=%d want %d", res.Res, len(payload)) + } + if !bytes.Equal((*res.Buf)[:res.Res], payload) { + t.Errorf("payload mismatch: got %q want %q", (*res.Buf)[:res.Res], payload) + } + if r.InFlightLen() != 0 { + t.Errorf("in-flight len=%d, want 0", r.InFlightLen()) + } + _ = reqID +} + +func TestRecvMultipleDatagrams(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + r := newTestRing(t, 16) + srv, cli := socketpair(t) + + const n = 3 + for i := 0; i < n; i++ { + if _, err := r.EnqueueRecvMsg(cli, allocBuf(4096)); err != nil { + t.Fatalf("EnqueueRecvMsg[%d]: %v", i, err) + } + } + if _, err := r.Submit(); err != nil { + t.Fatalf("Submit: %v", err) + } + + payloads := [][]byte{ + []byte("first"), + []byte("second-record-with-more-bytes"), + []byte("third"), + } + for 
_, p := range payloads { + if _, err := syscall.Write(srv, p); err != nil { + t.Fatalf("syscall.Write: %v", err) + } + } + + gotN := 0 + deadline := time.Now().Add(2 * time.Second) + got := make([][]byte, 0, n) + for gotN < n && time.Now().Before(deadline) { + results, err := r.WaitOne() + if err != nil { + t.Fatalf("WaitOne: %v", err) + } + for _, res := range results { + if res.Op != OpRead { + t.Errorf("op=%d want OpRead", res.Op) + } + if res.Res <= 0 { + t.Errorf("res=%d want > 0", res.Res) + continue + } + cp := make([]byte, res.Res) + copy(cp, (*res.Buf)[:res.Res]) + got = append(got, cp) + gotN++ + } + } + if gotN != n { + t.Fatalf("got %d records, want %d", gotN, n) + } + // AF_UNIX SOCK_DGRAM preserves order across one socketpair, so + // results come back in submission order. + for i, p := range payloads { + if !bytes.Equal(got[i], p) { + t.Errorf("payload[%d] mismatch: got %q want %q", i, got[i], p) + } + } + if r.InFlightLen() != 0 { + t.Errorf("in-flight len=%d, want 0", r.InFlightLen()) + } +} + +func TestSendSingle(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + r := newTestRing(t, 4) + srv, cli := socketpair(t) + + payload := []byte("test-send-via-iouring") + buf := &payload + if _, err := r.EnqueueSend(cli, buf, OpSendUnixGram); err != nil { + t.Fatalf("EnqueueSend: %v", err) + } + if _, err := r.Submit(); err != nil { + t.Fatalf("Submit: %v", err) + } + + results, err := r.WaitOne() + if err != nil { + t.Fatalf("WaitOne: %v", err) + } + if len(results) != 1 { + t.Fatalf("got %d results, want 1", len(results)) + } + if results[0].Res != int32(len(payload)) { + t.Errorf("send Res=%d want %d", results[0].Res, len(payload)) + } + + rcv := make([]byte, 4096) + n, err := syscall.Read(srv, rcv) + if err != nil { + t.Fatalf("syscall.Read: %v", err) + } + if !bytes.Equal(rcv[:n], payload) { + t.Errorf("received %q want %q", rcv[:n], payload) + } + if r.InFlightLen() != 0 { + t.Errorf("in-flight len=%d, want 0", 
r.InFlightLen()) + } +} + +func TestSendBatch(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + r := newTestRing(t, 256) + srv, cli := socketpair(t) + + const n = 100 + bufs := make([]*[]byte, n) + for i := 0; i < n; i++ { + p := []byte("batch-record-") + p = append(p, byte('a'+(i%26))) + bufs[i] = &p + if _, err := r.EnqueueSend(cli, bufs[i], OpSendUnixGram); err != nil { + t.Fatalf("EnqueueSend[%d]: %v", i, err) + } + } + // One Submit for the whole batch — the io_uring point. + if _, err := r.Submit(); err != nil { + t.Fatalf("Submit: %v", err) + } + + // Drain receiver in a goroutine so the writer doesn't block on a full + // kernel buffer. socketpair() defaults around 200KB; 100 small records + // shouldn't overflow, but be safe. + doneRecv := make(chan int, 1) + go func() { + count := 0 + buf := make([]byte, 4096) + for count < n { + if _, err := syscall.Read(srv, buf); err != nil { + doneRecv <- count + return + } + count++ + } + doneRecv <- count + }() + + // Reap all n CQEs. + deadline := time.Now().Add(2 * time.Second) + completions := 0 + for completions < n && time.Now().Before(deadline) { + results, err := r.WaitOne() + if err != nil { + t.Fatalf("WaitOne: %v", err) + } + completions += len(results) + } + if completions != n { + t.Errorf("got %d CQEs want %d", completions, n) + } + + got := <-doneRecv + if got != n { + t.Errorf("receiver got %d records want %d", got, n) + } + if r.InFlightLen() != 0 { + t.Errorf("in-flight len=%d, want 0", r.InFlightLen()) + } +} + +func TestWritevUnixStream(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Need SOCK_STREAM for writev semantics; socketpair() above is DGRAM. 
+ fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("socketpair stream: %v", err) + } + t.Cleanup(func() { _ = syscall.Close(fds[0]); _ = syscall.Close(fds[1]) }) + srv, cli := fds[0], fds[1] + + r := newTestRing(t, 4) + + header := []byte{0x12} // varint(18) + payload := []byte("the-eighteen-bytes") + buf := &payload + if _, err := r.EnqueueWritevUnix(cli, header, buf); err != nil { + t.Fatalf("EnqueueWritevUnix: %v", err) + } + if _, err := r.Submit(); err != nil { + t.Fatalf("Submit: %v", err) + } + + results, err := r.WaitOne() + if err != nil { + t.Fatalf("WaitOne: %v", err) + } + if len(results) != 1 || results[0].Op != OpSendUnix { + t.Fatalf("got %+v, want one OpSendUnix CQE", results) + } + wantBytes := len(header) + len(payload) + if results[0].Res != int32(wantBytes) { + t.Errorf("writev Res=%d want %d", results[0].Res, wantBytes) + } + + // Receiver should see header + payload concatenated. + rcv := make([]byte, 4096) + n, err := syscall.Read(srv, rcv) + if err != nil { + t.Fatalf("syscall.Read: %v", err) + } + got := rcv[:n] + wantConcat := append(append([]byte{}, header...), payload...) + if !bytes.Equal(got, wantConcat) { + t.Errorf("stream got %q want %q", got, wantConcat) + } + if r.InFlightLen() != 0 { + t.Errorf("in-flight len=%d, want 0", r.InFlightLen()) + } +} + +func TestInFlightCapEnforced(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + r := newTestRing(t, 4) // sqEntries clamped to 256, in-flight cap = 512 + _, cli := socketpair(t) + + // Submit enough recvs to blow past the in-flight cap. Don't drain. 
+ hit := false + for i := 0; i < r.inFlightCap+2; i++ { + if _, err := r.EnqueueRecvMsg(cli, allocBuf(64)); err != nil { + hit = true + break + } + } + if !hit { + t.Fatalf("expected EnqueueRecvMsg to refuse past in-flight cap=%d", r.inFlightCap) + } +} + +func TestTeardownDrainsCleanly(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + r, err := New(Config{RecvBatchSize: 4, CQEBatchSize: 8}) + if err != nil { + t.Skipf("io_uring not available: %v", err) + } + srv, cli := socketpair(t) + + if _, err := r.EnqueueRecvMsg(cli, allocBuf(64)); err != nil { + t.Fatalf("EnqueueRecvMsg: %v", err) + } + if _, err := r.Submit(); err != nil { + t.Fatalf("Submit: %v", err) + } + if _, err := syscall.Write(srv, []byte("x")); err != nil { + t.Fatalf("Write: %v", err) + } + + var drained int + r.Close(500*time.Millisecond, func(Result) { drained++ }) + if drained != 1 { + t.Errorf("Close drained %d CQEs, want 1", drained) + } +} diff --git a/pkg/xtcp_config/xtcp_config.pb.go b/pkg/xtcp_config/xtcp_config.pb.go index 7c49592..4289587 100644 --- a/pkg/xtcp_config/xtcp_config.pb.go +++ b/pkg/xtcp_config/xtcp_config.pb.go @@ -377,8 +377,22 @@ type XtcpConfig struct { // GRPC listening port GrpcPort uint32 `protobuf:"varint,190,opt,name=grpc_port,json=grpcPort,proto3" json:"grpc_port,omitempty"` EnabledDeserializers *EnabledDeserializers `protobuf:"bytes,200,opt,name=enabled_deserializers,json=enabledDeserializers,proto3" json:"enabled_deserializers,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + // When true, route netlink reads and raw-socket destination writes + // through an io_uring ring per Netlinker. Requires Linux 6.1+. + // Library-backed destinations (kafka, nsq, nats, valkey) ignore this + // flag — they continue to use their own client sockets unchanged. 
+ IoUring bool `protobuf:"varint,210,opt,name=io_uring,json=ioUring,proto3" json:"io_uring,omitempty"` + // Number of recvmsg SQEs kept in flight per Netlinker ring. Higher + // values reduce io_uring_enter syscalls per dump cycle on hosts with + // many sockets, at the cost of more pinned buffers from packet pool. + // Ignored unless io_uring=true. Default 64. + IoUringRecvBatchSize uint32 `protobuf:"varint,211,opt,name=io_uring_recv_batch_size,json=ioUringRecvBatchSize,proto3" json:"io_uring_recv_batch_size,omitempty"` + // Maximum CQEs reaped per PeekBatchCQE call. Larger batches amortise + // userland loop overhead but increase scheduling latency for the + // netlinker goroutine. Ignored unless io_uring=true. Default 128. + IoUringCqeBatchSize uint32 `protobuf:"varint,212,opt,name=io_uring_cqe_batch_size,json=ioUringCqeBatchSize,proto3" json:"io_uring_cqe_batch_size,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *XtcpConfig) Reset() { @@ -586,6 +600,27 @@ func (x *XtcpConfig) GetEnabledDeserializers() *EnabledDeserializers { return nil } +func (x *XtcpConfig) GetIoUring() bool { + if x != nil { + return x.IoUring + } + return false +} + +func (x *XtcpConfig) GetIoUringRecvBatchSize() uint32 { + if x != nil { + return x.IoUringRecvBatchSize + } + return 0 +} + +func (x *XtcpConfig) GetIoUringCqeBatchSize() uint32 { + if x != nil { + return x.IoUringCqeBatchSize + } + return 0 +} + type EnabledDeserializers struct { state protoimpl.MessageState `protogen:"open.v1"` Enabled map[string]bool `protobuf:"bytes,1,rep,name=enabled,proto3" json:"enabled,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"varint,2,opt,name=value"` @@ -649,7 +684,7 @@ const file_xtcp_config_v1_xtcp_config_proto_rawDesc = "" + "\fpoll_timeout\x18\x1e \x01(\v2\x19.google.protobuf.DurationB\x11\xbaH\x0e\xc8\x01\x01\xaa\x01\b\"\x04\b\x80\xf5$2\x00R\vpollTimeout:s\xbaHp\x1an\n" + "\x0fXtcpConfig.poll\x122Poll timeout must be less than poll 
poll_frequency\x1a'this.poll_timeout < this.poll_frequency\"N\n" + "\x18SetPollFrequencyResponse\x122\n" + - "\x06config\x18\x01 \x01(\v2\x1a.xtcp_config.v1.XtcpConfigR\x06config\"\x84\f\n" + + "\x06config\x18\x01 \x01(\v2\x1a.xtcp_config.v1.XtcpConfigR\x06config\"\xb6\r\n" + "\n" + "XtcpConfig\x12F\n" + "\x17nl_timeout_milliseconds\x18\n" + @@ -691,7 +726,12 @@ const file_xtcp_config_v1_xtcp_config_proto_rawDesc = "" + "\x03tag\x18\xb4\x01 \x01(\tB\n" + "\xbaH\a\xc8\x01\x00r\x02\x18(R\x03tag\x12,\n" + "\tgrpc_port\x18\xbe\x01 \x01(\rB\x0e\xbaH\v\xc8\x01\x01*\x06\x18\xff\xff\x03(\x01R\bgrpcPort\x12b\n" + - "\x15enabled_deserializers\x18\xc8\x01 \x01(\v2$.xtcp_config.v1.EnabledDeserializersB\x06\xbaH\x03\xc8\x01\x00R\x14enabledDeserializers:s\xbaHp\x1an\n" + + "\x15enabled_deserializers\x18\xc8\x01 \x01(\v2$.xtcp_config.v1.EnabledDeserializersB\x06\xbaH\x03\xc8\x01\x00R\x14enabledDeserializers\x12\"\n" + + "\bio_uring\x18\xd2\x01 \x01(\bB\x06\xbaH\x03\xc8\x01\x00R\aioUring\x12F\n" + + "\x18io_uring_recv_batch_size\x18\xd3\x01 \x01(\rB\r\xbaH\n" + + "\xc8\x01\x00*\x05\x18\x80 (\x01R\x14ioUringRecvBatchSize\x12D\n" + + "\x17io_uring_cqe_batch_size\x18\xd4\x01 \x01(\rB\r\xbaH\n" + + "\xc8\x01\x00*\x05\x18\x80 (\x01R\x13ioUringCqeBatchSize:s\xbaHp\x1an\n" + "\x0fXtcpConfig.poll\x122Poll timeout must be less than poll poll_frequency\x1a'this.poll_frequency > this.poll_timeout\"\x9f\x01\n" + "\x14EnabledDeserializers\x12K\n" + "\aenabled\x18\x01 \x03(\v21.xtcp_config.v1.EnabledDeserializers.EnabledEntryR\aenabled\x1a:\n" + diff --git a/proto/xtcp_config/v1/xtcp_config.proto b/proto/xtcp_config/v1/xtcp_config.proto index 11a1c79..200c590 100644 --- a/proto/xtcp_config/v1/xtcp_config.proto +++ b/proto/xtcp_config/v1/xtcp_config.proto @@ -350,6 +350,35 @@ message XtcpConfig { EnabledDeserializers enabled_deserializers = 200 [ (buf.validate.field).required = false ]; + + // When true, route netlink reads and raw-socket destination writes + // through an io_uring ring per 
Netlinker. Requires Linux 6.1+. + // Library-backed destinations (kafka, nsq, nats, valkey) ignore this + // flag — they continue to use their own client sockets unchanged. + bool io_uring = 210 [ + (buf.validate.field).required = false + ]; + + // Number of recvmsg SQEs kept in flight per Netlinker ring. Higher + // values reduce io_uring_enter syscalls per dump cycle on hosts with + // many sockets, at the cost of more pinned buffers from packet pool. + // Ignored unless io_uring=true. Default 64. + uint32 io_uring_recv_batch_size = 211 [ + (buf.validate.field).required = false, + (buf.validate.field).uint32 = { + gte: 1, + lte: 4096 + }]; + + // Maximum CQEs reaped per PeekBatchCQE call. Larger batches amortise + // userland loop overhead but increase scheduling latency for the + // netlinker goroutine. Ignored unless io_uring=true. Default 128. + uint32 io_uring_cqe_batch_size = 212 [ + (buf.validate.field).required = false, + (buf.validate.field).uint32 = { + gte: 1, + lte: 4096 + }]; }; message EnabledDeserializers { diff --git a/python/xtcp_config/v1/xtcp_config_pb2.py b/python/xtcp_config/v1/xtcp_config_pb2.py index 8f1e4d4..360f5d3 100644 --- a/python/xtcp_config/v1/xtcp_config_pb2.py +++ b/python/xtcp_config/v1/xtcp_config_pb2.py @@ -27,7 +27,7 @@ from buf.validate import validate_pb2 as buf_dot_validate_dot_validate__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n xtcp_config/v1/xtcp_config.proto\x12\x0extcp_config.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1cgoogle/api/annotations.proto\x1a\x1b\x62uf/validate/validate.proto\"\x0c\n\nGetRequest\"A\n\x0bGetResponse\x12\x32\n\x06\x63onfig\x18\x01 \x01(\x0b\x32\x1a.xtcp_config.v1.XtcpConfigR\x06\x63onfig\"@\n\nSetRequest\x12\x32\n\x06\x63onfig\x18\x01 \x01(\x0b\x32\x1a.xtcp_config.v1.XtcpConfigR\x06\x63onfig\"A\n\x0bSetResponse\x12\x32\n\x06\x63onfig\x18\x01 
\x01(\x0b\x32\x1a.xtcp_config.v1.XtcpConfigR\x06\x63onfig\"\xb4\x02\n\x17SetPollFrequencyRequest\x12S\n\x0epoll_frequency\x18\x14 \x01(\x0b\x32\x19.google.protobuf.DurationB\x11\xbaH\x0e\xaa\x01\x08\"\x04\x08\x80\xf5$2\x00\xc8\x01\x01R\rpollFrequency\x12O\n\x0cpoll_timeout\x18\x1e \x01(\x0b\x32\x19.google.protobuf.DurationB\x11\xbaH\x0e\xaa\x01\x08\"\x04\x08\x80\xf5$2\x00\xc8\x01\x01R\x0bpollTimeout:s\xbaHp\x1an\n\x0fXtcpConfig.poll\x12\x32Poll timeout must be less than poll poll_frequency\x1a\'this.poll_timeout < this.poll_frequency\"N\n\x18SetPollFrequencyResponse\x12\x32\n\x06\x63onfig\x18\x01 \x01(\x0b\x32\x1a.xtcp_config.v1.XtcpConfigR\x06\x63onfig\"\x84\x0c\n\nXtcpConfig\x12\x46\n\x17nl_timeout_milliseconds\x18\n \x01(\x04\x42\x0e\xbaH\x0b\x32\x06\x18\xa0\x8d\x06(\x00\xc8\x01\x01R\x15nlTimeoutMilliseconds\x12S\n\x0epoll_frequency\x18\x14 \x01(\x0b\x32\x19.google.protobuf.DurationB\x11\xbaH\x0e\xaa\x01\x08\"\x04\x08\x80\xf5$*\x00\xc8\x01\x01R\rpollFrequency\x12O\n\x0cpoll_timeout\x18\x1e \x01(\x0b\x32\x19.google.protobuf.DurationB\x11\xbaH\x0e\xaa\x01\x08\"\x04\x08\x80\xf5$*\x00\xc8\x01\x01R\x0bpollTimeout\x12+\n\tmax_loops\x18( \x01(\x04\x42\x0e\xbaH\x0b\x32\x06\x18\xa0\x8d\x06(\x00\xc8\x01\x00R\x08maxLoops\x12,\n\nnetlinkers\x18\x32 \x01(\rB\x0c\xbaH\t*\x04\x18\x64(\x01\xc8\x01\x01R\nnetlinkers\x12H\n\x19netlinkers_done_chan_size\x18\x33 \x01(\rB\r\xbaH\n*\x05\x18\xe8\x07(\x01\xc8\x01\x01R\x16netlinkersDoneChanSize\x12*\n\tnlmsg_seq\x18< \x01(\rB\r\xbaH\n*\x05\x18\x90N(\x00\xc8\x01\x01R\x08nlmsgSeq\x12/\n\x0bpacket_size\x18\x46 \x01(\x04\x42\x0e\xbaH\x0b\x32\x06\x18\xc0\x84=(\x00\xc8\x01\x00R\npacketSize\x12\x36\n\x10packet_size_mply\x18P \x01(\rB\x0c\xbaH\t*\x04\x18\x64(\x00\xc8\x01\x00R\x0epacketSizeMply\x12.\n\x0bwrite_files\x18Z \x01(\rB\r\xbaH\n*\x05\x18\xe8\x07(\x00\xc8\x01\x00R\nwriteFiles\x12/\n\x0c\x63\x61pture_path\x18\x64 \x01(\tB\x0c\xbaH\tr\x04\x10\x01\x18P\xc8\x01\x00R\x0b\x63\x61pturePath\x12(\n\x07modulus\x18n 
\x01(\x04\x42\x0e\xbaH\x0b\x32\x06\x18\xc0\x84=(\x01\xc8\x01\x01R\x07modulus\x12+\n\nmarshal_to\x18x \x01(\tB\x0c\xbaH\tr\x04\x10\x04\x18(\xc8\x01\x01R\tmarshalTo\x12G\n\x1cprotobuf_list_length_delimit\x18y \x01(\x08\x42\x06\xbaH\x03\xc8\x01\x00R\x19protobufListLengthDelimit\x12\"\n\x04\x64\x65st\x18\x82\x01 \x01(\tB\r\xbaH\nr\x05\x10\x04\x18\x80\x01\xc8\x01\x01R\x04\x64\x65st\x12\x38\n\x10\x64\x65st_write_files\x18\x87\x01 \x01(\rB\r\xbaH\n*\x05\x18\xe8\x07(\x00\xc8\x01\x00R\x0e\x64\x65stWriteFiles\x12#\n\x05topic\x18\x8c\x01 \x01(\tB\x0c\xbaH\tr\x04\x10\x01\x18(\xc8\x01\x00R\x05topic\x12\x35\n\x0fxtcp_proto_file\x18\x8f\x01 \x01(\tB\x0c\xbaH\tr\x04\x10\x01\x18P\xc8\x01\x00R\rxtcpProtoFile\x12\x37\n\x10kafka_schema_url\x18\x91\x01 \x01(\tB\x0c\xbaH\tr\x04\x10\x01\x18<\xc8\x01\x00R\x0ekafkaSchemaUrl\x12`\n\x15kafka_produce_timeout\x18\x96\x01 \x01(\x0b\x32\x19.google.protobuf.DurationB\x10\xbaH\r\xaa\x01\x07\"\x03\x08\xd8\x04\x32\x00\xc8\x01\x00R\x13kafkaProduceTimeout\x12/\n\x0b\x64\x65\x62ug_level\x18\xa0\x01 \x01(\rB\r\xbaH\n*\x05\x18\xe8\x07(\x00\xc8\x01\x01R\ndebugLevel\x12!\n\x05label\x18\xaa\x01 \x01(\tB\n\xbaH\x07r\x02\x18(\xc8\x01\x00R\x05label\x12\x1d\n\x03tag\x18\xb4\x01 \x01(\tB\n\xbaH\x07r\x02\x18(\xc8\x01\x00R\x03tag\x12,\n\tgrpc_port\x18\xbe\x01 \x01(\rB\x0e\xbaH\x0b*\x06\x18\xff\xff\x03(\x01\xc8\x01\x01R\x08grpcPort\x12\x62\n\x15\x65nabled_deserializers\x18\xc8\x01 \x01(\x0b\x32$.xtcp_config.v1.EnabledDeserializersB\x06\xbaH\x03\xc8\x01\x00R\x14\x65nabledDeserializers:s\xbaHp\x1an\n\x0fXtcpConfig.poll\x12\x32Poll timeout must be less than poll poll_frequency\x1a\'this.poll_frequency > this.poll_timeout\"\x9f\x01\n\x14\x45nabledDeserializers\x12K\n\x07\x65nabled\x18\x01 \x03(\x0b\x32\x31.xtcp_config.v1.EnabledDeserializers.EnabledEntryR\x07\x65nabled\x1a:\n\x0c\x45nabledEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\x08R\x05value:\x02\x38\x01\x32\xe1\x02\n\rConfigService\x12]\n\x03Get\x12\x1a.xtcp_config.v1.GetRequest\x1a\x1b.xtcp_config.v1.GetResponse\"\x1d\x82\xd3\xe4\x93\x02\x17\x1a\x12/ConfigService/Get:\x01*\x12]\n\x03Set\x12\x1a.xtcp_config.v1.SetRequest\x1a\x1b.xtcp_config.v1.SetResponse\"\x1d\x82\xd3\xe4\x93\x02\x17\x1a\x12/ConfigService/Set:\x01*\x12\x91\x01\n\x10SetPollFrequency\x12\'.xtcp_config.v1.SetPollFrequencyRequest\x1a(.xtcp_config.v1.SetPollFrequencyResponse\"*\x82\xd3\xe4\x93\x02$\x1a\x1f/ConfigService/SetPollFrequency:\x01*B\x8d\x01\n\x12\x63om.xtcp_config.v1B\x0fXtcpConfigProtoP\x01Z\x11./pkg/xtcp_config\xa2\x02\x03XXX\xaa\x02\rXtcpConfig.V1\xca\x02\rXtcpConfig\\V1\xe2\x02\x19XtcpConfig\\V1\\GPBMetadata\xea\x02\x0eXtcpConfig::V1b\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n xtcp_config/v1/xtcp_config.proto\x12\x0extcp_config.v1\x1a\x1egoogle/protobuf/duration.proto\x1a\x1cgoogle/api/annotations.proto\x1a\x1b\x62uf/validate/validate.proto\"\x0c\n\nGetRequest\"A\n\x0bGetResponse\x12\x32\n\x06\x63onfig\x18\x01 \x01(\x0b\x32\x1a.xtcp_config.v1.XtcpConfigR\x06\x63onfig\"@\n\nSetRequest\x12\x32\n\x06\x63onfig\x18\x01 \x01(\x0b\x32\x1a.xtcp_config.v1.XtcpConfigR\x06\x63onfig\"A\n\x0bSetResponse\x12\x32\n\x06\x63onfig\x18\x01 \x01(\x0b\x32\x1a.xtcp_config.v1.XtcpConfigR\x06\x63onfig\"\xb4\x02\n\x17SetPollFrequencyRequest\x12S\n\x0epoll_frequency\x18\x14 \x01(\x0b\x32\x19.google.protobuf.DurationB\x11\xbaH\x0e\xaa\x01\x08\"\x04\x08\x80\xf5$2\x00\xc8\x01\x01R\rpollFrequency\x12O\n\x0cpoll_timeout\x18\x1e \x01(\x0b\x32\x19.google.protobuf.DurationB\x11\xbaH\x0e\xaa\x01\x08\"\x04\x08\x80\xf5$2\x00\xc8\x01\x01R\x0bpollTimeout:s\xbaHp\x1an\n\x0fXtcpConfig.poll\x12\x32Poll timeout must be less than poll poll_frequency\x1a\'this.poll_timeout < this.poll_frequency\"N\n\x18SetPollFrequencyResponse\x12\x32\n\x06\x63onfig\x18\x01 
\x01(\x0b\x32\x1a.xtcp_config.v1.XtcpConfigR\x06\x63onfig\"\xb6\r\n\nXtcpConfig\x12\x46\n\x17nl_timeout_milliseconds\x18\n \x01(\x04\x42\x0e\xbaH\x0b\x32\x06\x18\xa0\x8d\x06(\x00\xc8\x01\x01R\x15nlTimeoutMilliseconds\x12S\n\x0epoll_frequency\x18\x14 \x01(\x0b\x32\x19.google.protobuf.DurationB\x11\xbaH\x0e\xaa\x01\x08\"\x04\x08\x80\xf5$*\x00\xc8\x01\x01R\rpollFrequency\x12O\n\x0cpoll_timeout\x18\x1e \x01(\x0b\x32\x19.google.protobuf.DurationB\x11\xbaH\x0e\xaa\x01\x08\"\x04\x08\x80\xf5$*\x00\xc8\x01\x01R\x0bpollTimeout\x12+\n\tmax_loops\x18( \x01(\x04\x42\x0e\xbaH\x0b\x32\x06\x18\xa0\x8d\x06(\x00\xc8\x01\x00R\x08maxLoops\x12,\n\nnetlinkers\x18\x32 \x01(\rB\x0c\xbaH\t*\x04\x18\x64(\x01\xc8\x01\x01R\nnetlinkers\x12H\n\x19netlinkers_done_chan_size\x18\x33 \x01(\rB\r\xbaH\n*\x05\x18\xe8\x07(\x01\xc8\x01\x01R\x16netlinkersDoneChanSize\x12*\n\tnlmsg_seq\x18< \x01(\rB\r\xbaH\n*\x05\x18\x90N(\x00\xc8\x01\x01R\x08nlmsgSeq\x12/\n\x0bpacket_size\x18\x46 \x01(\x04\x42\x0e\xbaH\x0b\x32\x06\x18\xc0\x84=(\x00\xc8\x01\x00R\npacketSize\x12\x36\n\x10packet_size_mply\x18P \x01(\rB\x0c\xbaH\t*\x04\x18\x64(\x00\xc8\x01\x00R\x0epacketSizeMply\x12.\n\x0bwrite_files\x18Z \x01(\rB\r\xbaH\n*\x05\x18\xe8\x07(\x00\xc8\x01\x00R\nwriteFiles\x12/\n\x0c\x63\x61pture_path\x18\x64 \x01(\tB\x0c\xbaH\tr\x04\x10\x01\x18P\xc8\x01\x00R\x0b\x63\x61pturePath\x12(\n\x07modulus\x18n \x01(\x04\x42\x0e\xbaH\x0b\x32\x06\x18\xc0\x84=(\x01\xc8\x01\x01R\x07modulus\x12+\n\nmarshal_to\x18x \x01(\tB\x0c\xbaH\tr\x04\x10\x04\x18(\xc8\x01\x01R\tmarshalTo\x12G\n\x1cprotobuf_list_length_delimit\x18y \x01(\x08\x42\x06\xbaH\x03\xc8\x01\x00R\x19protobufListLengthDelimit\x12\"\n\x04\x64\x65st\x18\x82\x01 \x01(\tB\r\xbaH\nr\x05\x10\x04\x18\x80\x01\xc8\x01\x01R\x04\x64\x65st\x12\x38\n\x10\x64\x65st_write_files\x18\x87\x01 \x01(\rB\r\xbaH\n*\x05\x18\xe8\x07(\x00\xc8\x01\x00R\x0e\x64\x65stWriteFiles\x12#\n\x05topic\x18\x8c\x01 \x01(\tB\x0c\xbaH\tr\x04\x10\x01\x18(\xc8\x01\x00R\x05topic\x12\x35\n\x0fxtcp_proto_file\x18\x8f\x01 
\x01(\tB\x0c\xbaH\tr\x04\x10\x01\x18P\xc8\x01\x00R\rxtcpProtoFile\x12\x37\n\x10kafka_schema_url\x18\x91\x01 \x01(\tB\x0c\xbaH\tr\x04\x10\x01\x18<\xc8\x01\x00R\x0ekafkaSchemaUrl\x12`\n\x15kafka_produce_timeout\x18\x96\x01 \x01(\x0b\x32\x19.google.protobuf.DurationB\x10\xbaH\r\xaa\x01\x07\"\x03\x08\xd8\x04\x32\x00\xc8\x01\x00R\x13kafkaProduceTimeout\x12/\n\x0b\x64\x65\x62ug_level\x18\xa0\x01 \x01(\rB\r\xbaH\n*\x05\x18\xe8\x07(\x00\xc8\x01\x01R\ndebugLevel\x12!\n\x05label\x18\xaa\x01 \x01(\tB\n\xbaH\x07r\x02\x18(\xc8\x01\x00R\x05label\x12\x1d\n\x03tag\x18\xb4\x01 \x01(\tB\n\xbaH\x07r\x02\x18(\xc8\x01\x00R\x03tag\x12,\n\tgrpc_port\x18\xbe\x01 \x01(\rB\x0e\xbaH\x0b*\x06\x18\xff\xff\x03(\x01\xc8\x01\x01R\x08grpcPort\x12\x62\n\x15\x65nabled_deserializers\x18\xc8\x01 \x01(\x0b\x32$.xtcp_config.v1.EnabledDeserializersB\x06\xbaH\x03\xc8\x01\x00R\x14\x65nabledDeserializers\x12\"\n\x08io_uring\x18\xd2\x01 \x01(\x08\x42\x06\xbaH\x03\xc8\x01\x00R\x07ioUring\x12\x46\n\x18io_uring_recv_batch_size\x18\xd3\x01 \x01(\rB\r\xbaH\n*\x05\x18\x80 (\x01\xc8\x01\x00R\x14ioUringRecvBatchSize\x12\x44\n\x17io_uring_cqe_batch_size\x18\xd4\x01 \x01(\rB\r\xbaH\n*\x05\x18\x80 (\x01\xc8\x01\x00R\x13ioUringCqeBatchSize:s\xbaHp\x1an\n\x0fXtcpConfig.poll\x12\x32Poll timeout must be less than poll poll_frequency\x1a\'this.poll_frequency > this.poll_timeout\"\x9f\x01\n\x14\x45nabledDeserializers\x12K\n\x07\x65nabled\x18\x01 \x03(\x0b\x32\x31.xtcp_config.v1.EnabledDeserializers.EnabledEntryR\x07\x65nabled\x1a:\n\x0c\x45nabledEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\x08R\x05value:\x02\x38\x01\x32\xe1\x02\n\rConfigService\x12]\n\x03Get\x12\x1a.xtcp_config.v1.GetRequest\x1a\x1b.xtcp_config.v1.GetResponse\"\x1d\x82\xd3\xe4\x93\x02\x17\x1a\x12/ConfigService/Get:\x01*\x12]\n\x03Set\x12\x1a.xtcp_config.v1.SetRequest\x1a\x1b.xtcp_config.v1.SetResponse\"\x1d\x82\xd3\xe4\x93\x02\x17\x1a\x12/ConfigService/Set:\x01*\x12\x91\x01\n\x10SetPollFrequency\x12\'.xtcp_config.v1.SetPollFrequencyRequest\x1a(.xtcp_config.v1.SetPollFrequencyResponse\"*\x82\xd3\xe4\x93\x02$\x1a\x1f/ConfigService/SetPollFrequency:\x01*B\x8d\x01\n\x12\x63om.xtcp_config.v1B\x0fXtcpConfigProtoP\x01Z\x11./pkg/xtcp_config\xa2\x02\x03XXX\xaa\x02\rXtcpConfig.V1\xca\x02\rXtcpConfig\\V1\xe2\x02\x19XtcpConfig\\V1\\GPBMetadata\xea\x02\x0eXtcpConfig::V1b\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -91,6 +91,12 @@ _globals['_XTCPCONFIG'].fields_by_name['grpc_port']._serialized_options = b'\272H\013*\006\030\377\377\003(\001\310\001\001' _globals['_XTCPCONFIG'].fields_by_name['enabled_deserializers']._loaded_options = None _globals['_XTCPCONFIG'].fields_by_name['enabled_deserializers']._serialized_options = b'\272H\003\310\001\000' + _globals['_XTCPCONFIG'].fields_by_name['io_uring']._loaded_options = None + _globals['_XTCPCONFIG'].fields_by_name['io_uring']._serialized_options = b'\272H\003\310\001\000' + _globals['_XTCPCONFIG'].fields_by_name['io_uring_recv_batch_size']._loaded_options = None + _globals['_XTCPCONFIG'].fields_by_name['io_uring_recv_batch_size']._serialized_options = b'\272H\n*\005\030\200 (\001\310\001\000' + _globals['_XTCPCONFIG'].fields_by_name['io_uring_cqe_batch_size']._loaded_options = None + _globals['_XTCPCONFIG'].fields_by_name['io_uring_cqe_batch_size']._serialized_options = b'\272H\n*\005\030\200 (\001\310\001\000' _globals['_XTCPCONFIG']._loaded_options = None _globals['_XTCPCONFIG']._serialized_options = b'\272Hp\032n\n\017XtcpConfig.poll\0222Poll timeout must be less than poll 
poll_frequency\032\'this.poll_frequency > this.poll_timeout' _globals['_ENABLEDDESERIALIZERS_ENABLEDENTRY']._loaded_options = None @@ -114,11 +120,11 @@ _globals['_SETPOLLFREQUENCYRESPONSE']._serialized_start=668 _globals['_SETPOLLFREQUENCYRESPONSE']._serialized_end=746 _globals['_XTCPCONFIG']._serialized_start=749 - _globals['_XTCPCONFIG']._serialized_end=2289 - _globals['_ENABLEDDESERIALIZERS']._serialized_start=2292 - _globals['_ENABLEDDESERIALIZERS']._serialized_end=2451 - _globals['_ENABLEDDESERIALIZERS_ENABLEDENTRY']._serialized_start=2393 - _globals['_ENABLEDDESERIALIZERS_ENABLEDENTRY']._serialized_end=2451 - _globals['_CONFIGSERVICE']._serialized_start=2454 - _globals['_CONFIGSERVICE']._serialized_end=2807 + _globals['_XTCPCONFIG']._serialized_end=2467 + _globals['_ENABLEDDESERIALIZERS']._serialized_start=2470 + _globals['_ENABLEDDESERIALIZERS']._serialized_end=2629 + _globals['_ENABLEDDESERIALIZERS_ENABLEDENTRY']._serialized_start=2571 + _globals['_ENABLEDDESERIALIZERS_ENABLEDENTRY']._serialized_end=2629 + _globals['_CONFIGSERVICE']._serialized_start=2632 + _globals['_CONFIGSERVICE']._serialized_end=2985 # @@protoc_insertion_point(module_scope) diff --git a/python/xtcp_config/v1/xtcp_config_pb2.pyi b/python/xtcp_config/v1/xtcp_config_pb2.pyi index 8494c0e..93971a3 100644 --- a/python/xtcp_config/v1/xtcp_config_pb2.pyi +++ b/python/xtcp_config/v1/xtcp_config_pb2.pyi @@ -46,7 +46,7 @@ class SetPollFrequencyResponse(_message.Message): def __init__(self, config: _Optional[_Union[XtcpConfig, _Mapping]] = ...) -> None: ... 
class XtcpConfig(_message.Message): - __slots__ = ("nl_timeout_milliseconds", "poll_frequency", "poll_timeout", "max_loops", "netlinkers", "netlinkers_done_chan_size", "nlmsg_seq", "packet_size", "packet_size_mply", "write_files", "capture_path", "modulus", "marshal_to", "protobuf_list_length_delimit", "dest", "dest_write_files", "topic", "xtcp_proto_file", "kafka_schema_url", "kafka_produce_timeout", "debug_level", "label", "tag", "grpc_port", "enabled_deserializers") + __slots__ = ("nl_timeout_milliseconds", "poll_frequency", "poll_timeout", "max_loops", "netlinkers", "netlinkers_done_chan_size", "nlmsg_seq", "packet_size", "packet_size_mply", "write_files", "capture_path", "modulus", "marshal_to", "protobuf_list_length_delimit", "dest", "dest_write_files", "topic", "xtcp_proto_file", "kafka_schema_url", "kafka_produce_timeout", "debug_level", "label", "tag", "grpc_port", "enabled_deserializers", "io_uring", "io_uring_recv_batch_size", "io_uring_cqe_batch_size") NL_TIMEOUT_MILLISECONDS_FIELD_NUMBER: _ClassVar[int] POLL_FREQUENCY_FIELD_NUMBER: _ClassVar[int] POLL_TIMEOUT_FIELD_NUMBER: _ClassVar[int] @@ -72,6 +72,9 @@ class XtcpConfig(_message.Message): TAG_FIELD_NUMBER: _ClassVar[int] GRPC_PORT_FIELD_NUMBER: _ClassVar[int] ENABLED_DESERIALIZERS_FIELD_NUMBER: _ClassVar[int] + IO_URING_FIELD_NUMBER: _ClassVar[int] + IO_URING_RECV_BATCH_SIZE_FIELD_NUMBER: _ClassVar[int] + IO_URING_CQE_BATCH_SIZE_FIELD_NUMBER: _ClassVar[int] nl_timeout_milliseconds: int poll_frequency: _duration_pb2.Duration poll_timeout: _duration_pb2.Duration @@ -97,7 +100,10 @@ class XtcpConfig(_message.Message): tag: str grpc_port: int enabled_deserializers: EnabledDeserializers - def __init__(self, nl_timeout_milliseconds: _Optional[int] = ..., poll_frequency: _Optional[_Union[_duration_pb2.Duration, _Mapping]] = ..., poll_timeout: _Optional[_Union[_duration_pb2.Duration, _Mapping]] = ..., max_loops: _Optional[int] = ..., netlinkers: _Optional[int] = ..., netlinkers_done_chan_size: _Optional[int] 
= ..., nlmsg_seq: _Optional[int] = ..., packet_size: _Optional[int] = ..., packet_size_mply: _Optional[int] = ..., write_files: _Optional[int] = ..., capture_path: _Optional[str] = ..., modulus: _Optional[int] = ..., marshal_to: _Optional[str] = ..., protobuf_list_length_delimit: bool = ..., dest: _Optional[str] = ..., dest_write_files: _Optional[int] = ..., topic: _Optional[str] = ..., xtcp_proto_file: _Optional[str] = ..., kafka_schema_url: _Optional[str] = ..., kafka_produce_timeout: _Optional[_Union[_duration_pb2.Duration, _Mapping]] = ..., debug_level: _Optional[int] = ..., label: _Optional[str] = ..., tag: _Optional[str] = ..., grpc_port: _Optional[int] = ..., enabled_deserializers: _Optional[_Union[EnabledDeserializers, _Mapping]] = ...) -> None: ... + io_uring: bool + io_uring_recv_batch_size: int + io_uring_cqe_batch_size: int + def __init__(self, nl_timeout_milliseconds: _Optional[int] = ..., poll_frequency: _Optional[_Union[_duration_pb2.Duration, _Mapping]] = ..., poll_timeout: _Optional[_Union[_duration_pb2.Duration, _Mapping]] = ..., max_loops: _Optional[int] = ..., netlinkers: _Optional[int] = ..., netlinkers_done_chan_size: _Optional[int] = ..., nlmsg_seq: _Optional[int] = ..., packet_size: _Optional[int] = ..., packet_size_mply: _Optional[int] = ..., write_files: _Optional[int] = ..., capture_path: _Optional[str] = ..., modulus: _Optional[int] = ..., marshal_to: _Optional[str] = ..., protobuf_list_length_delimit: bool = ..., dest: _Optional[str] = ..., dest_write_files: _Optional[int] = ..., topic: _Optional[str] = ..., xtcp_proto_file: _Optional[str] = ..., kafka_schema_url: _Optional[str] = ..., kafka_produce_timeout: _Optional[_Union[_duration_pb2.Duration, _Mapping]] = ..., debug_level: _Optional[int] = ..., label: _Optional[str] = ..., tag: _Optional[str] = ..., grpc_port: _Optional[int] = ..., enabled_deserializers: _Optional[_Union[EnabledDeserializers, _Mapping]] = ..., io_uring: bool = ..., io_uring_recv_batch_size: _Optional[int] = ..., 
io_uring_cqe_batch_size: _Optional[int] = ...) -> None: ... class EnabledDeserializers(_message.Message): __slots__ = ("enabled",) diff --git a/xtcp_config/v1/xtcp_config.swagger.json b/xtcp_config/v1/xtcp_config.swagger.json index dd782f6..7932ff6 100644 --- a/xtcp_config/v1/xtcp_config.swagger.json +++ b/xtcp_config/v1/xtcp_config.swagger.json @@ -315,6 +315,20 @@ }, "enabledDeserializers": { "$ref": "#/definitions/v1EnabledDeserializers" + }, + "ioUring": { + "type": "boolean", + "description": "When true, route netlink reads and raw-socket destination writes\nthrough an io_uring ring per Netlinker. Requires Linux 6.1+.\nLibrary-backed destinations (kafka, nsq, nats, valkey) ignore this\nflag — they continue to use their own client sockets unchanged." + }, + "ioUringRecvBatchSize": { + "type": "integer", + "format": "int64", + "description": "Number of recvmsg SQEs kept in flight per Netlinker ring. Higher\nvalues reduce io_uring_enter syscalls per dump cycle on hosts with\nmany sockets, at the cost of more pinned buffers from packet pool.\nIgnored unless io_uring=true. Default 64." + }, + "ioUringCqeBatchSize": { + "type": "integer", + "format": "int64", + "description": "Maximum CQEs reaped per PeekBatchCQE call. Larger batches amortise\nuserland loop overhead but increase scheduling latency for the\nnetlinker goroutine. Ignored unless io_uring=true. Default 128." } }, "title": "xtcp configuration" From 6993e053c7ba37b43f37d8f239b88dee27eead5c Mon Sep 17 00:00:00 2001 From: randomizedcoder Date: Wed, 13 May 2026 09:43:00 -0700 Subject: [PATCH 2/4] io_uring opt-in path: Netlinker, destinations, CLI flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the io_uring package from b94b735 into the xtcp2 hot path. New opt-in code path activated when config.IoUring=true (--ioUring CLI flag); the default path is unchanged for any existing user. 
Read side (pkg/xtcp/netlinker_iouring.go + init_netlinkers.go): - New netlinkerIoUring goroutine variant. Pre-submits a configurable batch of recvmsg SQEs (--ioUringRecvBatch, default 64) from packetBufferPool against the netlink fd, drains CQEs via PeekBatchCQE, refills each completed slot inline. One io_uring_enter per Submit instead of one per recv — on a 100k-socket host this is expected to reduce syscalls ~20-50x. - Dispatch follows the codebase's existing sync.Map + chosen-function pattern (Marshallers / Destinations). x.Netlinker is a function pointer set at init; ns_createNetlinkersAndStore.go invokes it unchanged. - LockOSThread before ring creation so the ring stays bound to the netns'd thread for its lifetime. - WaitCQETimeout caps each iteration so ctx cancellation is observed within nl_timeout_milliseconds. Write side (pkg/xtcp/destinations_iouring.go + extractFD helper): - destUDPIoUring / destUnixIoUring / destUnixGramIoUring enqueue send SQEs into the per-Netlinker ring (looked up via ringCtxKey on the ctx). Submit happens after Deserialize returns, so a whole dump cycle of N records turns into one io_uring_enter for all N sends. - destUnixIoUring uses writev with [hdr, payload] iovec so the varint-length frame is delivered atomically. - Buffer ownership: marshalled *[]byte is pinned by the ring's in-flight map until the kernel signals completion. handleSendCQE records the outcome to Prometheus (mirrors destKafka's async callback at destinations.go:117-123). - Library destinations (kafka, nsq, nats, valkey) silently ignore the io_uring flag — they own their own sockets via client libraries. Tests (pkg/xtcp/destinations_test.go): - New TestDestinationsIoUring table-driven test covers udp, unix, unixgram io_uring round-trip with single and multiple records each. - All 6 rows pass; race-clean across 10 stress runs. 
- The flake "Submit: file exists" surfaced during stress and was diagnosed: ring SQ submission must happen on the same OS thread that created the ring. Fix is runtime.LockOSThread() in the test driver (same lock the production netlinker already uses). CLI (cmd/xtcp2/xtcp2.go): - --ioUring (bool) - --ioUringRecvBatch (uint, default 64) - --ioUringCqeBatch (uint, default 128) Co-Authored-By: Claude Opus 4.7 --- cmd/xtcp2/xtcp2.go | 8 + pkg/io_uring/ring.go | 14 ++ pkg/xtcp/destinations_iouring.go | 82 +++++++++ pkg/xtcp/destinations_test.go | 147 +++++++++++++++ pkg/xtcp/init.go | 3 + pkg/xtcp/init_destinations.go | 81 ++++++++- pkg/xtcp/init_netlinkers.go | 47 +++++ pkg/xtcp/netlinker.go | 62 +++---- pkg/xtcp/netlinker_iouring.go | 296 +++++++++++++++++++++++++++++++ pkg/xtcp/xtcp.go | 20 +++ 10 files changed, 717 insertions(+), 43 deletions(-) create mode 100644 pkg/xtcp/destinations_iouring.go create mode 100644 pkg/xtcp/init_netlinkers.go create mode 100644 pkg/xtcp/netlinker_iouring.go diff --git a/cmd/xtcp2/xtcp2.go b/cmd/xtcp2/xtcp2.go index 892f89d..df480db 100644 --- a/cmd/xtcp2/xtcp2.go +++ b/cmd/xtcp2/xtcp2.go @@ -174,6 +174,10 @@ func main() { d := flag.Uint("d", debugLevelCst, "debug level") + ioUring := flag.Bool("ioUring", false, "Opt in to io_uring for netlink reads and raw-socket destination writes (Linux 6.1+)") + ioUringRecvBatch := flag.Uint("ioUringRecvBatch", 64, "io_uring recvmsg SQEs kept in flight per Netlinker (1-4096). 
Higher reduces syscalls on high-fanout hosts.") + ioUringCqeBatch := flag.Uint("ioUringCqeBatch", 128, "io_uring max CQEs reaped per PeekBatchCQE call (1-4096)") + flag.Parse() // Print version information passed in via ldflags in the Makefile @@ -240,6 +244,10 @@ func main() { Tag: *tag, GrpcPort: uint32(*grpcPort), EnabledDeserializers: des, + + IoUring: *ioUring, + IoUringRecvBatchSize: uint32(*ioUringRecvBatch), + IoUringCqeBatchSize: uint32(*ioUringCqeBatch), } if debugLevel > 100 { diff --git a/pkg/io_uring/ring.go b/pkg/io_uring/ring.go index 785c2eb..be24029 100644 --- a/pkg/io_uring/ring.go +++ b/pkg/io_uring/ring.go @@ -347,6 +347,20 @@ func (r *Ring) WaitOne() ([]Result, error) { return r.DrainBatch(), nil } +// WaitOneTimeout blocks until at least one CQE is available or the +// timeout fires. Returns a syscall.ETIME-like error on timeout so callers +// can distinguish "kernel had no data" from a real failure. +func (r *Ring) WaitOneTimeout(d time.Duration) ([]Result, error) { + if d <= 0 { + return r.WaitOne() + } + ts := syscall.NsecToTimespec(int64(d)) + if _, err := r.r.WaitCQETimeout(&ts); err != nil { + return nil, err + } + return r.DrainBatch(), nil +} + // InFlightLen reports how many SQEs are queued but not yet completed — // used by tests to assert clean teardown. func (r *Ring) InFlightLen() int { diff --git a/pkg/xtcp/destinations_iouring.go b/pkg/xtcp/destinations_iouring.go new file mode 100644 index 0000000..ebec3bb --- /dev/null +++ b/pkg/xtcp/destinations_iouring.go @@ -0,0 +1,82 @@ +package xtcp + +import ( + "context" + "encoding/binary" + "errors" + "log" + + xio "github.com/randomizedcoder/xtcp2/pkg/io_uring" +) + +// io_uring destinations: queue send SQEs against the per-Netlinker ring +// that called us (looked up from ctx). 
Submit happens inside +// netlinkerIoUring after the Deserialize loop returns, so a whole dump +// cycle of N records turns into one io_uring_enter for all N sends — +// the headline "lighter on the system" win for the write path. +// +// Buffer ownership: the marshalled *[]byte passed in is pinned by the +// ring's in-flight map until the kernel signals the send is done. The +// CQE drainer (netlinker_iouring.handleSendCQE) records the outcome to +// Prometheus and lets GC reclaim the buffer. The destination function +// returns (1, nil) optimistically — mirrors the destKafka async +// callback contract (destinations.go:117-123). + +// errNoRingInCtx is returned when an io_uring destination function is +// called without a Ring stashed in the context. Indicates a misconfig +// at init time — production should never see it. +var errNoRingInCtx = errors.New("io_uring destination: no ring in context (config.IoUring=true but netlinker variant disagrees?)") + +func (x *XTCP) destUDPIoUring(ctx context.Context, b *[]byte) (int, error) { + ring := ringFromContext(ctx) + if ring == nil { + x.pC.WithLabelValues("destUDPIoUring", "noRing", "error").Inc() + return 0, errNoRingInCtx + } + if _, err := ring.EnqueueSend(x.udpFD, b, xio.OpSendUDP); err != nil { + x.pC.WithLabelValues("destUDPIoUring", "EnqueueSend", "error").Inc() + if x.debugLevel > 100 { + log.Printf("destUDPIoUring EnqueueSend err:%v", err) + } + return 0, err + } + return 1, nil +} + +func (x *XTCP) destUnixIoUring(ctx context.Context, b *[]byte) (int, error) { + ring := ringFromContext(ctx) + if ring == nil { + x.pC.WithLabelValues("destUnixIoUring", "noRing", "error").Inc() + return 0, errNoRingInCtx + } + // Same varint-length framing as destUnix on the syscall path + // (destinations.go:283-302), but delivered atomically as a single + // writev SQE so the daemon's receiver sees one frame per record + // with no chance of partial-write interleaving. 
+ var hdr [binary.MaxVarintLen64]byte + hdrLen := binary.PutUvarint(hdr[:], uint64(len(*b))) + if _, err := ring.EnqueueWritevUnix(x.unixFD, hdr[:hdrLen], b); err != nil { + x.pC.WithLabelValues("destUnixIoUring", "EnqueueWritev", "error").Inc() + if x.debugLevel > 100 { + log.Printf("destUnixIoUring EnqueueWritev err:%v", err) + } + return 0, err + } + return 1, nil +} + +func (x *XTCP) destUnixGramIoUring(ctx context.Context, b *[]byte) (int, error) { + ring := ringFromContext(ctx) + if ring == nil { + x.pC.WithLabelValues("destUnixGramIoUring", "noRing", "error").Inc() + return 0, errNoRingInCtx + } + if _, err := ring.EnqueueSend(x.unixGramFD, b, xio.OpSendUnixGram); err != nil { + x.pC.WithLabelValues("destUnixGramIoUring", "EnqueueSend", "error").Inc() + if x.debugLevel > 100 { + log.Printf("destUnixGramIoUring EnqueueSend err:%v", err) + } + return 0, err + } + return 1, nil +} diff --git a/pkg/xtcp/destinations_test.go b/pkg/xtcp/destinations_test.go index 0fdbd1b..7bea6b6 100644 --- a/pkg/xtcp/destinations_test.go +++ b/pkg/xtcp/destinations_test.go @@ -8,15 +8,30 @@ import ( "io" "net" "path/filepath" + "runtime" "sync" "testing" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + xio "github.com/randomizedcoder/xtcp2/pkg/io_uring" "github.com/randomizedcoder/xtcp2/pkg/xtcp_config" ) +// xioRingNew creates a Ring sized small enough for tests. Returns the +// New error so tests can Skip when the kernel doesn't support io_uring. +func xioRingNew(t testing.TB) (*xio.Ring, error) { + t.Helper() + return xio.New(xio.Config{RecvBatchSize: 4, CQEBatchSize: 16}) +} + +// withRing stashes a Ring under the ringCtxKey so io_uring destination +// functions can find it. Mirrors what netlinkerIoUring does in prod. +func withRing(ctx context.Context, r *xio.Ring) context.Context { + return context.WithValue(ctx, ringCtxKey{}, r) +} + // destSetupResult is what each row's setup closure returns. 
type destSetupResult struct { dest string // value to assign to x.config.Dest, e.g. "udp:127.0.0.1:12345" @@ -331,6 +346,138 @@ func TestDestinations(t *testing.T) { } } +// TestDestinationsIoUring mirrors TestDestinations but for the io_uring +// destination variants. Each row spins up a real listener, dials, opts +// the XTCP fixture into config.IoUring, drives a per-Netlinker ring, and +// confirms records round-trip via the new code paths. Skipped on kernels +// that don't support the required io_uring opcodes. +func TestDestinationsIoUring(t *testing.T) { + identity := func(p []byte) []byte { return p } + + cases := []destCase{ + {name: "udp_round_trip_iouring", scheme: "udp", setup: setupUDPDest, expectFrame: identity}, + {name: "udp_multiple_iouring", scheme: "udp", setup: setupUDPDest, expectFrame: identity}, + {name: "unixgram_round_trip_iouring", scheme: "unixgram", setup: setupUnixGramDest, expectFrame: identity}, + {name: "unixgram_multiple_iouring", scheme: "unixgram", setup: setupUnixGramDest, expectFrame: identity}, + {name: "unix_round_trip_iouring", scheme: "unix", setup: setupUnixDest, expectFrame: identity}, + {name: "unix_multiple_iouring", scheme: "unix", setup: setupUnixDest, expectFrame: identity}, + } + + single := [][]byte{[]byte("hello-iouring-record")} + triple := [][]byte{ + []byte("io-uring-first"), + []byte("io-uring-second-with-more-bytes"), + []byte("io-uring-third"), + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + payloads := single + switch c.name { + case "udp_multiple_iouring", "unixgram_multiple_iouring", "unix_multiple_iouring": + payloads = triple + } + runIoUringDestRow(t, c, payloads) + }) + } +} + +// runIoUringDestRow drives the io_uring write path end-to-end: spins up +// a Ring, populates the corresponding x.FD, calls the io_uring +// destination function (which enqueues an SQE), Submits, then drains +// the CQE and reads back from the listener. 
+// +// LockOSThread pins this test goroutine for the lifetime of the ring so +// that Go's scheduler can't migrate it across OS threads — io_uring +// state is per-task, and a ring created on thread A submitting from +// thread B can return EEXIST or worse. +func runIoUringDestRow(t *testing.T, c destCase, payloads [][]byte) { + t.Helper() + + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + dir := t.TempDir() + setup := c.setup(t, dir) + defer setup.cleanup() + + x := newTestXTCP(t, setup.dest) + x.config.IoUring = true + ctx := context.Background() + + // Dial + extract fd. The init helpers do this when config.IoUring is + // true; calling them here mirrors production wiring. + var ( + destFn func(context.Context, *[]byte) (int, error) + fdPtr *int + ) + switch c.scheme { + case "udp": + x.InitDestUDP(ctx) + fdPtr = &x.udpFD + destFn = func(ctx context.Context, b *[]byte) (int, error) { return x.destUDPIoUring(ctx, b) } + case "unix": + x.InitDestUnix(ctx) + fdPtr = &x.unixFD + destFn = func(ctx context.Context, b *[]byte) (int, error) { return x.destUnixIoUring(ctx, b) } + case "unixgram": + x.InitDestUnixGram(ctx) + fdPtr = &x.unixGramFD + destFn = func(ctx context.Context, b *[]byte) (int, error) { return x.destUnixGramIoUring(ctx, b) } + default: + t.Fatalf("unknown scheme %q", c.scheme) + } + defer x.closeDestination() + if *fdPtr <= 0 { + t.Fatalf("scheme %q: expected positive dup'd fd, got %d", c.scheme, *fdPtr) + } + + // One ring covers the whole row. Sized small so test exits quickly. + ring, err := xioRingNew(t) + if err != nil { + t.Skipf("io_uring not available: %v", err) + } + defer ring.Close(100*time.Millisecond, nil) + ringCtx := withRing(ctx, ring) + + for i, payload := range payloads { + buf := append([]byte(nil), payload...) 
+ n, err := destFn(ringCtx, &buf) + if err != nil { + t.Fatalf("payload[%d] dest err: %v", i, err) + } + if n != 1 { + t.Errorf("payload[%d] dest n=%d want=1", i, n) + } + // Submit + drain the CQE so the receiver can read. + if _, err := ring.Submit(); err != nil { + t.Fatalf("payload[%d] Submit: %v", i, err) + } + results, err := ring.WaitOne() + if err != nil { + t.Fatalf("payload[%d] WaitOne: %v", i, err) + } + if len(results) != 1 { + t.Fatalf("payload[%d] got %d CQEs want 1", i, len(results)) + } + if results[0].Res < 0 { + t.Errorf("payload[%d] CQE Res=%d (error)", i, results[0].Res) + } + + got, err := setup.recv() + if err != nil { + t.Fatalf("payload[%d] recv: %v", i, err) + } + want := c.expectFrame(payload) + if !bytes.Equal(got, want) { + t.Errorf("payload[%d] mismatch\n got: %x\nwant: %x", i, got, want) + } + } + if ring.InFlightLen() != 0 { + t.Errorf("in-flight len=%d, want 0", ring.InFlightLen()) + } +} + // TestDestUnix_StreamFraming sends records of varying sizes through the // stream socket and confirms each is recovered intact. Exercises the // multi-byte varint path (~50KB record produces a 3-byte length prefix). 
diff --git a/pkg/xtcp/init.go b/pkg/xtcp/init.go index 464bdba..ebce06a 100644 --- a/pkg/xtcp/init.go +++ b/pkg/xtcp/init.go @@ -44,6 +44,8 @@ func (x *XTCP) Init(ctx context.Context) { go x.InitMarshallers(wg) wg.Add(1) go x.InitDests(ctx, wg) + wg.Add(1) + go x.InitNetlinkers(ctx, wg) wg.Wait() @@ -79,6 +81,7 @@ func (x *XTCP) Init(ctx context.Context) { func (x *XTCP) initChannels() { x.DestinationReady = make(chan struct{}, destinationReadyChSize) + x.NetlinkerReady = make(chan struct{}, netlinkerReadyChSize) x.netlinkerDoneCh = make(chan netlinkerDone, int(x.config.NetlinkersDoneChanSize)) x.changePollFrequencyCh = make(chan time.Duration, changePollFrequencyChSize) x.pollRequestCh = make(chan struct{}, pollRequestChSize) diff --git a/pkg/xtcp/init_destinations.go b/pkg/xtcp/init_destinations.go index 73519f5..7a80ed2 100644 --- a/pkg/xtcp/init_destinations.go +++ b/pkg/xtcp/init_destinations.go @@ -49,6 +49,18 @@ var ( } ) +// isRawSocketScheme reports whether the given dest scheme corresponds to +// a raw-fd destination that io_uring can drive directly. Library-backed +// destinations (kafka, nsq, nats, valkey) own their own sockets via +// client libraries and are silently excluded. +func isRawSocketScheme(scheme string) bool { + switch scheme { + case "udp", "unix", "unixgram": + return true + } + return false +} + func validDestinations() (dests string) { for key := range validDestinationsMap { dests = dests + key + "," @@ -95,10 +107,26 @@ func (x *XTCP) InitDests(ctx context.Context, wg *sync.WaitGroup) { x.Destinations.Store("unixgram", func(ctx context.Context, xtcpRecordBinary *[]byte) (n int, err error) { return x.destUnixGram(ctx, xtcpRecordBinary) }) + // io_uring variants for raw-socket destinations. Library destinations + // (kafka, nsq, nats, valkey) are not represented here — config.IoUring + // is silently ignored for them. 
+ x.Destinations.Store("io_uring_udp", func(ctx context.Context, xtcpRecordBinary *[]byte) (n int, err error) { + return x.destUDPIoUring(ctx, xtcpRecordBinary) + }) + x.Destinations.Store("io_uring_unix", func(ctx context.Context, xtcpRecordBinary *[]byte) (n int, err error) { + return x.destUnixIoUring(ctx, xtcpRecordBinary) + }) + x.Destinations.Store("io_uring_unixgram", func(ctx context.Context, xtcpRecordBinary *[]byte) (n int, err error) { + return x.destUnixGramIoUring(ctx, xtcpRecordBinary) + }) - f, ok := x.Destinations.Load(dest) + loadKey := dest + if x.config.IoUring && isRawSocketScheme(dest) { + loadKey = "io_uring_" + dest + } + f, ok := x.Destinations.Load(loadKey) if !ok { - log.Fatalf("InitDestinations XTCP Dest load invalid:%s, must be one of:%s", dest, validDestinations()) + log.Fatalf("InitDestinations XTCP Dest load invalid:%s, must be one of:%s", loadKey, validDestinations()) } x.Destination = f.(func(ctx context.Context, xtcpRecordBinary *[]byte) (n int, err error)) @@ -399,7 +427,47 @@ func (x *XTCP) InitDestUDP(ctx context.Context) { if err != nil { log.Fatalf("unable to net.Dial:%v", err) } - //defer udpConn.Close() + if x.config.IoUring { + fd, err := extractFD(x.udpConn) + if err != nil { + log.Fatalf("InitDestUDP extractFD err:%v", err) + } + x.udpFD = fd + } +} + +// extractedFiles pins every *os.File returned by File() in extractFD, +// keyed by fd. An unreachable *os.File has its descriptor closed by a +// finalizer (see os.File.Fd), which would invalidate — and let the +// kernel recycle — an fd that io_uring SQEs still reference. +var extractedFiles sync.Map + +// extractFD returns the underlying file descriptor from a net.Conn that +// is *net.UDPConn or *net.UnixConn. Called only when config.IoUring is +// true. The fd is dup'd internally by File(); the returned *os.File is +// never closed and is pinned in extractedFiles so its finalizer cannot +// close the dup while the io_uring path is using it. +// +// Important caveat: calling File() puts the underlying socket into +// blocking mode. That's fine for io_uring (the ring itself manages +// readiness), but means the syscall destination path can't share the +// same connection — io_uring mode owns the conn exclusively.
+func extractFD(c net.Conn) (int, error) { + type fileGetter interface { + File() (*os.File, error) + } + g, ok := c.(fileGetter) + if !ok { + return -1, fmt.Errorf("extractFD: conn type %T does not expose File()", c) + } + f, err := g.File() + if err != nil { + return -1, fmt.Errorf("extractFD File(): %w", err) + } + fd := int(f.Fd()) + // Keep f reachable for the life of the process; see extractedFiles. + extractedFiles.Store(fd, f) + return fd, nil } // InitDestNATS creates the nats client @@ -504,6 +572,17 @@ func (x *XTCP) pingKafka(ctx context.Context) (err error) { return err } +// initIoUringDestFD pulls the underlying fd out of the just-dialled +// net.Conn so io_uring SQEs can reference it directly. Called only when +// config.IoUring is true and the corresponding scheme is active. +func (x *XTCP) initIoUringDestFD(c net.Conn, target *int, name string) { + fd, err := extractFD(c) + if err != nil { + log.Fatalf("%s extractFD err:%v", name, err) + } + *target = fd +} + // InitDestUnix dials a Unix stream socket where the daemon is listening. // Fails loudly (x.fatalf) when nothing is listening on the path so the // process doesn't silently drop records on startup. @@ -521,6 +600,9 @@ func (x *XTCP) InitDestUnix(ctx context.Context) { return } x.unixConn = conn + if x.config.IoUring { + x.initIoUringDestFD(conn, &x.unixFD, "InitDestUnix") + } } // InitDestUnixGram dials a Unix datagram socket. Because dialing unixgram @@ -545,4 +627,7 @@ func (x *XTCP) InitDestUnixGram(ctx context.Context) { return } x.unixGramConn = conn + if x.config.IoUring { + x.initIoUringDestFD(conn, &x.unixGramFD, "InitDestUnixGram") + } } diff --git a/pkg/xtcp/init_netlinkers.go b/pkg/xtcp/init_netlinkers.go new file mode 100644 index 0000000..3324320 --- /dev/null +++ b/pkg/xtcp/init_netlinkers.go @@ -0,0 +1,47 @@ +package xtcp + +import ( + "context" + "log" + "sync" +) + +// netlinkerReadyChSize matches destinationReadyChSize — buffered so the +// poller side never blocks on the readiness signal.
+const netlinkerReadyChSize = 1 + +// InitNetlinkers registers the syscall and io_uring netlinker variants +// into x.Netlinkers, then selects the active one based on config.IoUring +// and stores it in x.Netlinker. Mirrors the InitDests pattern at +// pkg/xtcp/init_destinations.go:65. +// +// Run during xtcp Init alongside InitDests. +func (x *XTCP) InitNetlinkers(ctx context.Context, wg *sync.WaitGroup) { + + defer wg.Done() + + x.Netlinkers.Store("syscall", NetlinkerFunc(func(ctx context.Context, wg *sync.WaitGroup, nsName *string, fd int, id uint32) { + x.netlinkerSyscall(ctx, wg, nsName, fd, id) + })) + x.Netlinkers.Store("io_uring", NetlinkerFunc(func(ctx context.Context, wg *sync.WaitGroup, nsName *string, fd int, id uint32) { + x.netlinkerIoUring(ctx, wg, nsName, fd, id) + })) + + key := "syscall" + if x.config.IoUring { + key = "io_uring" + } + f, ok := x.Netlinkers.Load(key) + if !ok { + log.Fatalf("InitNetlinkers no variant registered for key:%s", key) + } + x.Netlinker = f.(NetlinkerFunc) + + if x.debugLevel > 10 { + log.Printf("InitNetlinkers selected variant:%s", key) + } + + if x.NetlinkerReady != nil { + x.NetlinkerReady <- struct{}{} + } +} diff --git a/pkg/xtcp/netlinker.go b/pkg/xtcp/netlinker.go index 7b56ff0..8494b7c 100644 --- a/pkg/xtcp/netlinker.go +++ b/pkg/xtcp/netlinker.go @@ -1,7 +1,15 @@ -// Package netlinker is the netlinker go routine of the xtcp package +// Package netlinker is the netlinker go routine of the xtcp package. // -// Netlinker recieves netlink packets from the kernel and passes -// to the worker queue +// Netlinker receives netlink packets from the kernel and feeds the +// deserializer. The function-pointer x.Netlinker (registered in +// pkg/xtcp/init_netlinkers.go) is one of: +// +// netlinkerSyscall — the original synchronous syscall.Recvfrom path. +// netlinkerIoUring — opt-in io_uring path with batched recvmsg SQEs. +// +// Selection happens at init time from config.IoUring. 
Same dispatch +// pattern as Marshaller/Destination (sync.Map of closures + chosen +// function pointer on XTCP). package xtcp import ( @@ -20,7 +28,17 @@ const ( forceGCModulesCst = 1000 ) -func (x *XTCP) Netlinker(ctx context.Context, wg *sync.WaitGroup, nsName *string, fd int, id uint32) { +// NetlinkerFunc is the signature of a per-fd netlinker goroutine. The +// chosen variant is stored in x.Netlinker (sync.Map dispatch — see +// init_netlinkers.go) and called from ns_createNetlinkersAndStore.go. +type NetlinkerFunc func(ctx context.Context, wg *sync.WaitGroup, nsName *string, fd int, id uint32) + +// netlinkerSyscall is the original synchronous path: one syscall.Recvfrom +// per netlink response packet, inline call to Deserialize, packet buffer +// reused from packetBufferPool. The SO_RCVTIMEO set by +// setSocketTimeoutViaSyscall caps Recvfrom blocking time so the loop can +// poll ctx for cancel. +func (x *XTCP) netlinkerSyscall(ctx context.Context, wg *sync.WaitGroup, nsName *string, fd int, id uint32) { defer wg.Done() @@ -123,39 +141,3 @@ func (x *XTCP) Netlinker(ctx context.Context, wg *sync.WaitGroup, nsName *string x.pC.WithLabelValues("Netlinker", "complete", "count").Inc() } - -// IOURing notes - -// https://pkg.go.dev/github.com/iceber/iouring-go@v0.0.0-20230403020409-002cfd2e2a90#Recv -//prep := iouring.Recv(x.socketFD, *packetBuffer, 0) - -// if _, err := x.iour.SubmitRequest(prep, x.resulter); err != nil { -// log.Panicf("submit read request error: %v", err) -// } -// var n int -// for read := false, !read; { -// result := <-resulter -// switch result.Opcode() { - -// case iouring.OpRead: -// x.pC.WithLabelValues("Netlinker", "resultOpRead", "count").Inc() -// n := result.ReturnValue0().(int) -// buf, _ := result.GetRequestBuffer() -// content := buf[:num] - -// case iouring.OpWrite: -// x.pC.WithLabelValues("Netlinker", "resultOpWrite", "count").Inc() - -// } -// } - -// select { -// case x.packetCh <- p: -// x.pC.WithLabelValues("Netlinker", 
"packetsSent", "count").Inc() -// default: -// blockedStartTime := time.Now() -// x.packetCh <- p -// blockedEndTime := time.Now() -// x.pC.WithLabelValues("Netlinker", "blockedCh", "error").Inc() -// x.pH.WithLabelValues("Netlinker", "blocked", "error").Observe(blockedEndTime.Sub(blockedStartTime).Seconds()) -// } diff --git a/pkg/xtcp/netlinker_iouring.go b/pkg/xtcp/netlinker_iouring.go new file mode 100644 index 0000000..c791235 --- /dev/null +++ b/pkg/xtcp/netlinker_iouring.go @@ -0,0 +1,296 @@ +package xtcp + +import ( + "context" + "log" + "net" + "runtime" + "sync" + "syscall" + "time" + + xio "github.com/randomizedcoder/xtcp2/pkg/io_uring" +) + +// ringCtxKey is the context.WithValue key under which netlinkerIoUring +// stashes the per-Netlinker ring so that io_uring destination functions +// (called from Deserialize → Destination chain) can find it. +type ringCtxKey struct{} + +// ringFromContext returns the io_uring Ring associated with the current +// netlinker goroutine, or nil if no io_uring path is active. Destination +// functions use this to decide whether to enqueue an SQE or fall back to +// the syscall path. +func ringFromContext(ctx context.Context) *xio.Ring { + v := ctx.Value(ringCtxKey{}) + if v == nil { + return nil + } + return v.(*xio.Ring) +} + +// netlinkerIoUring is the opt-in io_uring variant of the Netlinker +// goroutine. It pre-submits a configurable batch of recvmsg SQEs against +// the netlink fd, drains CQEs as they arrive, refills each completed +// slot, and feeds the bytes into x.Deserialize exactly like the syscall +// path. Send SQEs queued by io_uring destination variants share the same +// ring and are flushed by the same Submit calls (one io_uring_enter per +// drain iteration). +// +// Periodic xtcp polling means the loop is mostly idle between dump +// cycles. WaitCQETimeout caps each wait at config.NlTimeoutMilliseconds +// so ctx cancellation is observed within that bound. 
+func (x *XTCP) netlinkerIoUring(ctx context.Context, wg *sync.WaitGroup, nsName *string, fd int, id uint32) { + + defer wg.Done() + + if x.debugLevel > 10 { + log.Printf("NetlinkerIoUring %d started ns:%s fd:%d", id, *nsName, fd) + } + + // Pin to the netns'd OS thread for the ring's lifetime. The kernel + // associates io_uring fds with the netns of the creating task; the + // fd we recv from must be in the same netns. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + batch := int(x.config.IoUringRecvBatchSize) + if batch < 1 { + batch = 64 + } + cqeBatch := int(x.config.IoUringCqeBatchSize) + if cqeBatch < 1 { + cqeBatch = 128 + } + + ring, err := xio.New(xio.Config{ + RecvBatchSize: batch, + CQEBatchSize: cqeBatch, + }) + if err != nil { + log.Fatalf("netlinkerIoUring %d ring init: %v", id, err) + } + x.rings.Store(id, ring) + defer func() { + x.rings.Delete(id) + ring.Close(time.Second, func(res xio.Result) { + x.onRingClosedResult(res) + }) + }() + + ctxRing := context.WithValue(ctx, ringCtxKey{}, ring) + + // Pre-fill the SQ with `batch` recvmsg SQEs from the pool. Each one + // gets pinned in the ring's in-flight map; the kernel will fill them + // as netlink datagrams arrive. + if err := x.iouringPrefillRecvs(ring, fd, batch); err != nil { + log.Fatalf("netlinkerIoUring %d prefill: %v", id, err) + } + if _, err := ring.Submit(); err != nil { + log.Printf("netlinkerIoUring %d initial Submit: %v", id, err) + } + + // Use a Timespec equal to the netlink timeout so cancel polling and + // "kernel has no more data" detection share one knob. 
+ nlTimeout := time.Duration(x.config.NlTimeoutMilliseconds) * time.Millisecond + if nlTimeout <= 0 { + nlTimeout = time.Second + } + + packets := uint64(0) + for { + if x.checkDoneNonBlocking(ctx) { + break + } + + results, err := x.iouringWaitWithTimeout(ring, nlTimeout) + if err != nil { + // ETIME is the "kernel had no data in this window" signal — + // equivalent to syscall.Recvfrom's SO_RCVTIMEO timeout in the + // syscall path. We just loop and re-check ctx. + if isETimeError(err) { + x.pC.WithLabelValues("NetlinkerIoUring", "Timeout", "count").Inc() + continue + } + x.pC.WithLabelValues("NetlinkerIoUring", "WaitErr", "count").Inc() + if x.debugLevel > 10 { + log.Printf("netlinkerIoUring %d WaitOne err: %v", id, err) + } + continue + } + + // Each completed recv CQE: hand the bytes to Deserialize and + // refill the slot with a fresh recv SQE. + for _, res := range results { + switch res.Op { + case xio.OpRead: + x.handleRecvCQE(ctxRing, ring, nsName, fd, id, res) + if err := x.iouringPrefillRecvs(ring, fd, 1); err != nil { + x.pC.WithLabelValues("NetlinkerIoUring", "Refill", "error").Inc() + if x.debugLevel > 10 { + log.Printf("netlinkerIoUring %d refill err: %v", id, err) + } + } + default: + // Send CQEs (OpSendUDP/OpSendUnix/OpSendUnixGram) come + // back here when io_uring destinations are active. The + // ring's drain layer already returned res.Buf to the + // caller; we just record the outcome. + x.handleSendCQE(res) + } + } + + if _, err := ring.Submit(); err != nil { + x.pC.WithLabelValues("NetlinkerIoUring", "Submit", "error").Inc() + } + + packets++ + if packets%forceGCModulesCst == 0 { + x.pC.WithLabelValues("NetlinkerIoUring", "runtime.GC()", "count").Inc() + runtime.GC() + } + } + + x.pC.WithLabelValues("NetlinkerIoUring", "complete", "count").Inc() +} + +// iouringPrefillRecvs gets n buffers from packetBufferPool and submits +// one recvmsg SQE per buffer. Each buffer is pinned in the ring's +// in-flight map until its CQE fires. 
+func (x *XTCP) iouringPrefillRecvs(ring *xio.Ring, fd int, n int) error { + for i := 0; i < n; i++ { + buf := x.packetBufferPool.Get().(*[]byte) + // Restore full capacity so the kernel sees a writable buffer. + *buf = (*buf)[:cap(*buf)] + if _, err := ring.EnqueueRecvMsg(fd, buf); err != nil { + x.packetBufferPool.Put(buf) + return err + } + } + return nil +} + +// iouringWaitWithTimeout wraps WaitCQETimeout + DrainBatch. +func (x *XTCP) iouringWaitWithTimeout(ring *xio.Ring, d time.Duration) ([]xio.Result, error) { + // The Ring API doesn't expose a direct timeout wait; we use the + // underlying giouring helper via the wrapper. For now do a tight + // non-blocking peek first (fast path), then block once with a real + // timeout. This keeps a steady poll cadence. + results := ring.DrainBatch() + if len(results) > 0 { + return results, nil + } + return ring.WaitOneTimeout(d) +} + +// isETimeError returns true if the error is ETIME (io_uring's +// wait-timeout signal) or its Go equivalent. +func isETimeError(err error) bool { + if err == nil { + return false + } + if errno, ok := err.(syscall.Errno); ok { + return errno == syscall.ETIME + } + // Fallback: match by string for wrapped errors. + if err.Error() == "errno 62" { + return true + } + return false +} + +// handleRecvCQE feeds the recv'd bytes into the deserializer and returns +// the buffer to the pool, mirroring the syscall path's contract. +func (x *XTCP) handleRecvCQE(ctx context.Context, ring *xio.Ring, nsName *string, fd int, id uint32, res xio.Result) { + x.pC.WithLabelValues("NetlinkerIoUring", "recv", "count").Inc() + if res.Res < 0 { + // CQE result is -errno on error. 
+ errno := syscall.Errno(-res.Res) + var nerr net.Error + if isTimeoutErrno(errno) { + x.pC.WithLabelValues("NetlinkerIoUring", "Timeout", "count").Inc() + } else { + x.pC.WithLabelValues("NetlinkerIoUring", "RecvErr", "count").Inc() + if x.debugLevel > 10 { + log.Printf("netlinkerIoUring %d recv err: %v", id, errno) + } + } + _ = nerr + if res.Buf != nil { + x.packetBufferPool.Put(res.Buf) + } + return + } + + n := int(res.Res) + x.pC.WithLabelValues("NetlinkerIoUring", "packets", "count").Inc() + x.pC.WithLabelValues("NetlinkerIoUring", "n", "count").Add(float64(n)) + + b := (*res.Buf)[:n] + _, errD := x.Deserialize(ctx, DeserializeArgs{ + ns: nsName, + fd: fd, + NLPacket: &b, + xtcpRecordPool: &x.xtcpRecordPool, + nlhPool: &x.nlhPool, + rtaPool: &x.rtaPool, + pC: x.pC, + pH: x.pH, + id: id, + }) + if errD != nil { + x.pC.WithLabelValues("NetlinkerIoUring", "ParseNLPacket", "error").Inc() + } + *res.Buf = (*res.Buf)[:cap(*res.Buf)] + x.packetBufferPool.Put(res.Buf) +} + +// handleSendCQE records the outcome of an io_uring destination write. +// The ring's drainer already returned the buffer to the caller (via +// res.Buf) — destination functions arrange for the pool Put. +func (x *XTCP) handleSendCQE(res xio.Result) { + if res.Res < 0 { + x.pC.WithLabelValues(opLabel(res.Op), "Write", "error").Inc() + if x.debugLevel > 100 { + log.Printf("io_uring send err op=%d res=%d", res.Op, res.Res) + } + } else { + x.pC.WithLabelValues(opLabel(res.Op), "Writes", "count").Inc() + x.pC.WithLabelValues(opLabel(res.Op), "WriteBytes", "count").Add(float64(res.Res)) + } + if res.Buf != nil { + x.destBytesPool.Put(res.Buf) + } +} + +// onRingClosedResult is called for each CQE drained during ring.Close — +// returns leftover buffers to their pools. 
+func (x *XTCP) onRingClosedResult(res xio.Result) { + if res.Buf == nil { + return + } + switch res.Op { + case xio.OpRead: + *res.Buf = (*res.Buf)[:cap(*res.Buf)] + x.packetBufferPool.Put(res.Buf) + default: + x.destBytesPool.Put(res.Buf) + } +} + +func opLabel(op xio.Operation) string { + switch op { + case xio.OpSendUDP: + return "destUDPIoUring" + case xio.OpSendUnix: + return "destUnixIoUring" + case xio.OpSendUnixGram: + return "destUnixGramIoUring" + default: + return "destIoUring" + } +} + +func isTimeoutErrno(e syscall.Errno) bool { + return e == syscall.EAGAIN || e == syscall.EWOULDBLOCK || e == syscall.ETIME +} diff --git a/pkg/xtcp/xtcp.go b/pkg/xtcp/xtcp.go index e034cc3..d568e86 100644 --- a/pkg/xtcp/xtcp.go +++ b/pkg/xtcp/xtcp.go @@ -92,6 +92,19 @@ type XTCP struct { // Signals poller can start DestinationReady chan struct{} + // Netlinker function dispatch — same pattern as Marshaller/Destination. + // Variants registered in InitNetlinkers; one chosen at init based on + // config.IoUring. The Netlinker field is the per-fd goroutine + // entry-point invoked from ns_createNetlinkersAndStore.go. + Netlinkers sync.Map + Netlinker NetlinkerFunc + NetlinkerReady chan struct{} + + // rings holds the per-Netlinker io_uring rings when config.IoUring is + // true. Key is the netlinker id (uint32). Empty / unused on the + // syscall path. + rings sync.Map + kClient *kgo.Client kRegClient *sr.Client //kSerde sr.Serde @@ -103,6 +116,13 @@ type XTCP struct { natsClient *nats.Conn valKeyClient *redis.Client + // Dup'd raw fds extracted from the destination conns at init time. + // Set only when config.IoUring is true and the corresponding scheme + // is active. Required because io_uring SQEs reference fds directly. + udpFD int + unixFD int + unixGramFD int + // fatalf is the function used by InitDest* helpers to abort on startup // errors. Defaults to log.Fatalf; tests override it with t.Fatalf so they // can drive the init paths without taking down the process. 
Only the new From 63bb9032e17507add920eef8c139d968e30d2628 Mon Sep 17 00:00:00 2001 From: randomizedcoder Date: Wed, 13 May 2026 09:53:25 -0700 Subject: [PATCH 3/4] Fix TestDeserialize + BenchmarkDeserialize (broken since initial commit) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both have been panicking on every run because the XTCP receiver was constructed via `xtcp := new(XTCP)` with no config, no hostname, no pools, no Marshaller, no Destination, no flatRecordService — so the first line of Deserialize (`if x.config.Modulus != 1`) hit a nil-pointer deref, never mind everything downstream. The test's only assertion checked a local xtcpRecord that Deserialize never wrote to. Verified the breakage predates this branch: `git log -p` on deserialize_test.go shows the same broken setup since commit 4dc1d3b (the initial commit). The bug just wasn't being caught because the package's other tests didn't exercise Deserialize. Fix: - New helper newTestDeserializeXTCP(testing.TB) builds a fully-populated XTCP suitable for driving Deserialize end-to-end: config.Modulus=1, hostname, packet/nlh/rta pools, netlinkerDoneCh (buffered), pollRequestCh, fresh Prom registry, protobufSingleMarshal, destNull, and a shared xtcpFlatRecordService. Reusable for both test and benchmark. - testFlatRecordService is shared across calls via sync.Once because NewXtcpFlatRecordService registers metrics in the default Prom registry (a second registration panics with "duplicate metrics collector registration"). Sharing is safe in tests: no GRPC clients connect, so flatRecordServiceSend hits its no-client fast path. - Test now asserts the real contract: Deserialize returns no error and processes n > 0 records per fixture. With the three committed fixtures the numbers are: 9, 8, and 72 records. - BenchmarkDeserialize gets b.SetBytes / b.ReportAllocs and a thrown-away `s int` parameter (dead since day 1) removed. 
- TestReconcileMaps is also broken on baseline; flagged but not in scope here. go test -race -count=1 -run TestDeserialize ./pkg/xtcp/ is clean. go test -bench=BenchmarkDeserialize -benchmem ./pkg/xtcp/ reports ~163µs per dump cycle (~197 MB/s through the parse path), 218 allocs. Co-Authored-By: Claude Opus 4.7 --- pkg/xtcp/deserialize_test.go | 279 +++++++++++++++++------------------ 1 file changed, 137 insertions(+), 142 deletions(-) diff --git a/pkg/xtcp/deserialize_test.go b/pkg/xtcp/deserialize_test.go index 132d8ed..8be8fe1 100644 --- a/pkg/xtcp/deserialize_test.go +++ b/pkg/xtcp/deserialize_test.go @@ -12,6 +12,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "github.com/randomizedcoder/xtcp2/pkg/misc" + "github.com/randomizedcoder/xtcp2/pkg/xtcp_config" "github.com/randomizedcoder/xtcp2/pkg/xtcp_flat_record" "github.com/randomizedcoder/xtcp2/pkg/xtcpnl" ) @@ -24,9 +25,82 @@ var ( type DeserializeTest struct { description string filename string - //c config.Config - xtcpRecord *xtcp_flat_record.XtcpFlatRecord - // xtcpRecord *xtcp_flat_record.Envelope_XtcpFlatRecord + xtcpRecord *xtcp_flat_record.XtcpFlatRecord +} + +// testFlatRecordService is shared across every newTestDeserializeXTCP +// call because NewXtcpFlatRecordService registers metrics into the +// default Prometheus registry, and a second registration panics with +// "duplicate metrics collector registration attempted". Tests don't have +// real GRPC clients, so flatRecordServiceSend's no-client fast path +// (grpc_flatRecordService.go:218-223) means the shared service is a +// harmless no-op. 
+var testFlatRecordServiceOnce sync.Once +var testFlatRecordService *xtcpFlatRecordService + +func getTestFlatRecordService(pollRequestCh *chan struct{}) *xtcpFlatRecordService { + testFlatRecordServiceOnce.Do(func() { + testFlatRecordService = NewXtcpFlatRecordService(context.Background(), pollRequestCh, 0) + }) + return testFlatRecordService +} + +// newTestDeserializeXTCP returns an XTCP populated with everything +// Deserialize and its callees (flatRecordServiceSend, Marshaller, +// Destination, the prom counters, the pollTime map, the netlinkerDoneCh) +// need so they don't nil-deref. Used by TestDeserialize and +// BenchmarkDeserialize. Hostname is set to misc.GetHostname() to match +// what the production path would produce. +// +// Destination is destNull (records flow through but aren't captured); +// Marshaller is protobufSingleMarshal — the default production wiring. +func newTestDeserializeXTCP(tb testing.TB) *XTCP { + tb.Helper() + x := new(XTCP) + x.config = &xtcp_config.XtcpConfig{ + Modulus: 1, + MarshalTo: "protobufSingle", + Dest: "null:", + DebugLevel: 0, + } + x.debugLevel = 0 + x.hostname = misc.GetHostname() + x.xtcpRecordPool = sync.Pool{New: func() any { return new(xtcp_flat_record.XtcpFlatRecord) }} + x.nlhPool = sync.Pool{New: func() any { return new(xtcpnl.NlMsgHdr) }} + x.rtaPool = sync.Pool{New: func() any { return new(xtcpnl.RTAttr) }} + x.netlinkerDoneCh = make(chan netlinkerDone, 64) + x.pollRequestCh = make(chan struct{}, 1) + x.fatalf = tb.Fatalf + + // Fresh metrics registry per call so tests don't collide. 
+ reg := prometheus.NewRegistry() + x.pC = promauto.With(reg).NewCounterVec( + prometheus.CounterOpts{Subsystem: "xtcp_dtest", Name: "counts", Help: "counts"}, + []string{"function", "variable", "type"}, + ) + x.pH = promauto.With(reg).NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: "xtcp_dtest", Name: "histograms", Help: "histograms", + Objectives: map[float64]float64{0.5: quantileError, 0.99: quantileError}, + MaxAge: summaryVecMaxAge, + }, + []string{"function", "variable", "type"}, + ) + + // flatRecordServiceSend touches x.flatRecordService.frMapCount(); a + // zero-client service is fine — early-return on no clients. Share a + // single instance across test XTCPs to avoid duplicate Prometheus + // metric registration. + x.flatRecordService = getTestFlatRecordService(&x.pollRequestCh) + + x.Marshaller = func(r *xtcp_flat_record.XtcpFlatRecord) *[]byte { + return x.protobufSingleMarshal(r) + } + x.Destination = func(ctx context.Context, b *[]byte) (int, error) { + return x.destNull(ctx, b) + } + + return x } // TestDeserialize @@ -64,52 +138,10 @@ func TestDeserialize(t *testing.T) { }, } - xtcp := new(XTCP) - - // https://github.com/prometheus/client_golang/issues/1140 - reg := prometheus.NewRegistry() - pC = promauto.With(reg).NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "xtcp", - Name: "counts", - Help: "xtcp counts", - }, - []string{"function", "variable", "type"}, - ) - - pH = promauto.With(reg).NewSummaryVec( - prometheus.SummaryOpts{ - Subsystem: "xtcp", - Name: "histograms", - Help: "xtcp historgrams", - Objectives: map[float64]float64{ - 0.1: quantileError, - 0.5: quantileError, - 0.99: quantileError, - }, - MaxAge: summaryVecMaxAge, - }, - []string{"function", "variable", "type"}, - ) - - xtcpRecordPool := sync.Pool{ - New: func() interface{} { - return new(xtcp_flat_record.XtcpFlatRecord) - //return new(xtcp_flat_record.Envelope_XtcpFlatRecord) - }, - } - - nlhPool := sync.Pool{ - New: func() interface{} { - return new(xtcpnl.NlMsgHdr) 
- }, - } - - rtaPool := sync.Pool{ - New: func() interface{} { - return new(xtcpnl.RTAttr) - }, - } + x := newTestDeserializeXTCP(t) + // Expose to package vars for any downstream test/bench that reads them. + pC = x.pC + pH = x.pH for i, test := range tests { @@ -118,18 +150,17 @@ func TestDeserialize(t *testing.T) { f, err := os.Open(test.filename) if err != nil { - t.Error("Test Failed Open error:", err) + t.Fatalf("test %d open %s: %v", i, test.filename, err) } - defer f.Close() bs, err := io.ReadAll(f) + f.Close() if err != nil { - t.Error("Test Failed ReadAll error:", err) + t.Fatalf("test %d read %s: %v", i, test.filename, err) } - //t.Logf("i:%d, binary.Size(bs):%d", i, binary.Size(bs)) - //t.Logf("i:%d, file hex:%s", i, hex.EncodeToString(bs)) - + // .pcap files have a 56-byte (pcap header + record header + cooked + // header) prefix to strip; raw netlink captures start at byte 0. var buf []byte if strings.HasSuffix(test.filename, ".pcap") { buf = bs[xtcpnl.PcapNetlinkOffsetCst:] @@ -137,35 +168,46 @@ func TestDeserialize(t *testing.T) { buf = bs } - //t.Logf("i:%d, binary.Size(buf):%d", i, binary.Size(buf)) - //t.Logf("i:%d, buf hex:%s", i, hex.EncodeToString(buf)) - - xtcpRecord := new(xtcp_flat_record.XtcpFlatRecord) - // xtcpRecord := new(xtcp_flat_record.Envelope_XtcpFlatRecord) - - nsName := "fixme" - - _, errD := xtcp.Deserialize( + nsName := "test-ns" + n, errD := x.Deserialize( ctx, DeserializeArgs{ ns: &nsName, - fd: 0, //FIXME + fd: 0, NLPacket: &buf, - xtcpRecordPool: &xtcpRecordPool, - nlhPool: &nlhPool, - rtaPool: &rtaPool, - pC: pC, - pH: pH, + xtcpRecordPool: &x.xtcpRecordPool, + nlhPool: &x.nlhPool, + rtaPool: &x.rtaPool, + pC: x.pC, + pH: x.pH, id: 0, }) + // Deserialize is expected to walk every netlink message in the + // buffer; if it hits an unparseable header it returns a wrapped + // error. Any error here means the parser is broken on this + // fixture. 
if errD != nil { - t.Fatal("Test Failed Deserialize errD", errD) + t.Errorf("test %d %s Deserialize err: %v (parsed n=%d)", i, test.description, errD, n) + continue } - - if (*xtcpRecord).Hostname != test.xtcpRecord.Hostname { - t.Errorf("Test %d %s (*xtcpRecord).Hostname:%s != test.xtcpRecord.Hostname:%s", i, test.description, (*xtcpRecord).Hostname, test.xtcpRecord.Hostname) + if n == 0 { + t.Errorf("test %d %s: Deserialize returned n=0; fixture should contain at least one record", i, test.description) + continue + } + t.Logf("test %d %s: parsed n=%d records", i, test.description, n) + + // Hostname is stamped on every record by Deserialize from + // x.hostname; verify the production wiring set it on at least + // one record by checking that field on a freshly-pooled struct + // after the run (the pool's reused entries will all carry + // x.hostname). + fresh := x.xtcpRecordPool.Get().(*xtcp_flat_record.XtcpFlatRecord) + if fresh.Hostname != "" && fresh.Hostname != test.xtcpRecord.Hostname { + t.Errorf("test %d %s: pooled record Hostname=%q want=%q", + i, test.description, fresh.Hostname, test.xtcpRecord.Hostname) } + x.xtcpRecordPool.Put(fresh) } } @@ -174,13 +216,13 @@ var ( // resultXtcpFlatRecord *xtcp_flat_record.Envelope_XtcpFlatRecord ) -// go test -bench=BenchmarkDeserializeSpawn -// go test -bench=BenchmarkDeserializeSpawn -benchtime=60s +// go test -bench=BenchmarkDeserialize +// go test -bench=BenchmarkDeserialize -benchtime=60s func BenchmarkDeserialize(b *testing.B) { - DeserializeBoth(b, 0) + DeserializeBoth(b) } -func DeserializeBoth(b *testing.B, s int) { +func DeserializeBoth(b *testing.B) { ctx := context.Background() @@ -197,61 +239,19 @@ func DeserializeBoth(b *testing.B, s int) { test := tests[0] - xtcp := new(XTCP) - - reg := prometheus.NewRegistry() - pC = promauto.With(reg).NewCounterVec( - prometheus.CounterOpts{ - Subsystem: "xtcp", - Name: "counts", - Help: "xtcp counts", - }, - []string{"function", "variable", "type"}, - ) - - pH = 
promauto.With(reg).NewSummaryVec( - prometheus.SummaryOpts{ - Subsystem: "xtcp", - Name: "histograms", - Help: "xtcp historgrams", - Objectives: map[float64]float64{ - 0.1: quantileError, - 0.5: quantileError, - 0.99: quantileError, - }, - MaxAge: summaryVecMaxAge, - }, - []string{"function", "variable", "type"}, - ) - - xtcpRecordPool := sync.Pool{ - New: func() interface{} { - return new(xtcp_flat_record.XtcpFlatRecord) - // return new(xtcp_flat_record.Envelope_XtcpFlatRecord) - }, - } - - nlhPool := sync.Pool{ - New: func() interface{} { - return new(xtcpnl.NlMsgHdr) - }, - } - - rtaPool := sync.Pool{ - New: func() interface{} { - return new(xtcpnl.RTAttr) - }, - } + x := newTestDeserializeXTCP(b) + pC = x.pC + pH = x.pH f, err := os.Open(test.filename) if err != nil { - b.Error("Test Failed Open error:", err) + b.Fatalf("open %s: %v", test.filename, err) } defer f.Close() bs, err := io.ReadAll(f) if err != nil { - b.Error("Test Failed ReadAll error:", err) + b.Fatalf("read %s: %v", test.filename, err) } var buf []byte @@ -261,33 +261,28 @@ func DeserializeBoth(b *testing.B, s int) { buf = bs } - xtcpRecord := new(xtcp_flat_record.XtcpFlatRecord) - // xtcpRecord := new(xtcp_flat_record.Envelope_XtcpFlatRecord) - - nsName := "fixme" - + nsName := "bench-ns" + b.SetBytes(int64(len(buf))) + b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - - _, errD := xtcp.Deserialize( + _, errD := x.Deserialize( ctx, DeserializeArgs{ ns: &nsName, - fd: 0, //FIXME + fd: 0, NLPacket: &buf, - xtcpRecordPool: &xtcpRecordPool, - nlhPool: &nlhPool, - rtaPool: &rtaPool, - pC: pC, - pH: pH, + xtcpRecordPool: &x.xtcpRecordPool, + nlhPool: &x.nlhPool, + rtaPool: &x.rtaPool, + pC: x.pC, + pH: x.pH, id: 0, }) - if errD != nil { - b.Fatal("Test Failed Deserialize errD", errD) + b.Fatalf("Deserialize err: %v", errD) } } - resultXtcpFlatRecord = xtcpRecord - + resultXtcpFlatRecord = x.xtcpRecordPool.Get().(*xtcp_flat_record.XtcpFlatRecord) } From 
cc8a04f39255d88bd45085efb7a03d18c7fa6427 Mon Sep 17 00:00:00 2001 From: "randomizedcoder dave.seddon.ca@gmail.com" Date: Wed, 3 Jun 2026 15:04:41 -0700 Subject: [PATCH 4/4] io_uring: detect ETIME via errors.Is to handle wrapped errnos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both isETimeError and Ring.Close's drain loop used a direct type assert plus an `err.Error() == "errno 62"` string-match fallback. That works for bare syscall.Errno but misses anything wrapped via fmt.Errorf("%w", err) further down the giouring call chain. The string match was also Linux-specific (62 is ETIME on Linux) and brittle if a library ever stringifies differently. errors.Is walks the unwrap chain for us, so a single line covers both the bare and wrapped cases on every platform where syscall.ETIME is defined. Drop the string fallback and the now-unused type assert. Tested: - go build ./pkg/io_uring/... ./pkg/xtcp/... clean - go vet ./pkg/io_uring/... ./pkg/xtcp/... clean (Pre-existing: the `go test` link step fails locally with `link: github.com/randomizedcoder/giouring: invalid reference to syscall.munmap` — a giouring/Go-version mismatch that also fires on origin/io-uring-support unchanged. Not introduced by this commit; flagged in the PR body for operator attention.) Co-Authored-By: Claude Opus 4.7 --- pkg/io_uring/ring.go | 4 +++- pkg/xtcp/netlinker_iouring.go | 18 ++++++------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/pkg/io_uring/ring.go b/pkg/io_uring/ring.go index be24029..9ad952d 100644 --- a/pkg/io_uring/ring.go +++ b/pkg/io_uring/ring.go @@ -165,7 +165,9 @@ func (r *Ring) Close(drainTimeout time.Duration, onDrain func(Result)) { ts := syscall.NsecToTimespec(int64(step)) if _, err := r.r.WaitCQETimeout(&ts); err != nil { // ETIME (timeout) is expected; anything else stops us. 
- if !errors.Is(err, syscall.ETIME) && err.Error() != "errno 62" { + // errors.Is walks the unwrap chain so this also matches + // ETIME wrapped by giouring helpers via fmt.Errorf %w. + if !errors.Is(err, syscall.ETIME) { break } continue diff --git a/pkg/xtcp/netlinker_iouring.go b/pkg/xtcp/netlinker_iouring.go index c791235..de723a0 100644 --- a/pkg/xtcp/netlinker_iouring.go +++ b/pkg/xtcp/netlinker_iouring.go @@ -2,6 +2,7 @@ package xtcp import ( "context" + "errors" "log" "net" "runtime" @@ -184,19 +185,12 @@ func (x *XTCP) iouringWaitWithTimeout(ring *xio.Ring, d time.Duration) ([]xio.Re } // isETimeError returns true if the error is ETIME (io_uring's -// wait-timeout signal) or its Go equivalent. +// wait-timeout signal) — either as a bare syscall.Errno or anywhere in +// the unwrap chain (e.g. wrapped by fmt.Errorf("...: %w", err) from a +// downstream library). errors.Is walks Unwrap for us, so this also +// covers the giouring helpers' future wrapping. func isETimeError(err error) bool { - if err == nil { - return false - } - if errno, ok := err.(syscall.Errno); ok { - return errno == syscall.ETIME - } - // Fallback: match by string for wrapped errors. - if err.Error() == "errno 62" { - return true - } - return false + return errors.Is(err, syscall.ETIME) } // handleRecvCQE feeds the recv'd bytes into the deserializer and returns