diff --git a/.github/workflows/functional_tests.yml b/.github/workflows/functional_tests.yml index 59a5e15c16..0154d4120d 100644 --- a/.github/workflows/functional_tests.yml +++ b/.github/workflows/functional_tests.yml @@ -6,7 +6,7 @@ on: - main workflow_dispatch: jobs: - frank-single-transaction: + single-transaction: runs-on: group: github-v1 env: @@ -25,4 +25,4 @@ jobs: - name: Run functional tests run: | - ./src/test/frank-single-transaction.sh + ./src/test/single-transaction.sh diff --git a/ffi/rust/firedancer-sys/Cargo.toml b/ffi/rust/firedancer-sys/Cargo.toml index 8bd0acebeb..94cd40b3d6 100644 --- a/ffi/rust/firedancer-sys/Cargo.toml +++ b/ffi/rust/firedancer-sys/Cargo.toml @@ -11,7 +11,6 @@ include = [ "staging", "build.rs", "wrapper_ballet.h", - "wrapper_disco.h", "wrapper_tango.h", "wrapper_util.h", ] diff --git a/ffi/rust/firedancer-sys/build.rs b/ffi/rust/firedancer-sys/build.rs index 0a3a6d731c..921970d7a9 100644 --- a/ffi/rust/firedancer-sys/build.rs +++ b/ffi/rust/firedancer-sys/build.rs @@ -53,13 +53,11 @@ fn main() { println!("cargo:rerun-if-changed=../../../src/ballet"); println!("cargo:rerun-if-changed=wrapper_tango.h"); println!("cargo:rerun-if-changed=../../../src/tango"); - println!("cargo:rerun-if-changed=wrapper_disco.h"); - println!("cargo:rerun-if-changed=../../../src/disco"); "../../../" }; - for lib in ["util", "ballet", "tango", "disco"] { + for lib in ["util", "ballet", "tango"] { // Generate bindings to the header files let mut builder = bindgen::Builder::default() .wrap_static_fns(true) @@ -179,12 +177,9 @@ fn main() { .env("CC", "gcc") // Always use GCC for building FFI .env("BASEDIR", out_dir.join("build")); - // No statics in disco yet so no extern wrapper file is produced - if lib != "disco" { - let key = format!("{}_STATIC_EXTERN_OBJECT", lib.to_uppercase()); - let value = out_dir.join(&format!("gen_{}.c", lib)); - command.env(key, value); - } + let key = format!("{}_STATIC_EXTERN_OBJECT", lib.to_uppercase()); + let value = 
out_dir.join(&format!("gen_{}.c", lib)); + command.env(key, value); let output = command.output().unwrap_or_else(|_| { panic!( @@ -211,7 +206,6 @@ fn main() { ); println!("cargo:rustc-link-lib=static=fd_util"); println!("cargo:rustc-link-lib=static=fd_tango"); - println!("cargo:rustc-link-lib=static=fd_disco"); println!("cargo:rustc-link-lib=static=fd_ballet"); println!("cargo:rustc-link-lib=stdc++"); } diff --git a/ffi/rust/firedancer-sys/src/lib.rs b/ffi/rust/firedancer-sys/src/lib.rs index 58bb4078f4..cf57f0f085 100644 --- a/ffi/rust/firedancer-sys/src/lib.rs +++ b/ffi/rust/firedancer-sys/src/lib.rs @@ -19,7 +19,6 @@ mod generated { include!(concat!(env!("OUT_DIR"), "/bindings_util.rs")); include!(concat!(env!("OUT_DIR"), "/bindings_ballet.rs")); - include!(concat!(env!("OUT_DIR"), "/bindings_disco.rs")); include!(concat!(env!("OUT_DIR"), "/bindings_tango.rs")); } diff --git a/ffi/rust/firedancer-sys/src/tango/dcache.rs b/ffi/rust/firedancer-sys/src/tango/dcache.rs index 0beee3f697..492d323753 100644 --- a/ffi/rust/firedancer-sys/src/tango/dcache.rs +++ b/ffi/rust/firedancer-sys/src/tango/dcache.rs @@ -4,6 +4,8 @@ pub use crate::generated::{ fd_dcache_app_laddr_const, fd_dcache_app_sz, fd_dcache_compact_chunk0, + fd_dcache_compact_wmark, + fd_dcache_compact_next, fd_dcache_compact_is_safe, fd_dcache_data_sz, fd_dcache_delete, diff --git a/ffi/rust/firedancer-sys/wrapper_disco.h b/ffi/rust/firedancer-sys/wrapper_disco.h deleted file mode 100644 index 323f7f4bb2..0000000000 --- a/ffi/rust/firedancer-sys/wrapper_disco.h +++ /dev/null @@ -1 +0,0 @@ -#include "src/disco/fd_disco.h" diff --git a/solana b/solana index 8a5d9143fa..3f17cf0fe1 160000 --- a/solana +++ b/solana @@ -1 +1 @@ -Subproject commit 8a5d9143fae1c3bc3e0318ed5ea0c66e3031d692 +Subproject commit 3f17cf0fe1c16be765f306f3a135e281ce2804f7 diff --git a/src/app/fdctl/Local.mk b/src/app/fdctl/Local.mk index 1ffd535df0..f54345eed3 100644 --- a/src/app/fdctl/Local.mk +++ b/src/app/fdctl/Local.mk @@ -5,8 +5,8 
@@ ifdef FD_HAS_DOUBLE .PHONY: fdctl run monitor cargo -$(call add-objs,main1 config security utility run keygen ready monitor/monitor monitor/helper configure/configure configure/large_pages configure/sysctl configure/shmem configure/xdp configure/xdp_leftover configure/ethtool configure/workspace_leftover configure/workspace,fd_fdctl) -$(call make-bin-rust,fdctl,main,fd_fdctl fd_frank fd_disco fd_ballet fd_tango fd_util fd_quic solana_validator_fd) +$(call add-objs,main1 config security utility run/run run/tiles/dedup run/tiles/pack run/tiles/quic run/tiles/verify keygen ready monitor/monitor monitor/helper configure/configure configure/large_pages configure/sysctl configure/shmem configure/xdp configure/xdp_leftover configure/ethtool configure/workspace_leftover configure/workspace,fd_fdctl) +$(call make-bin-rust,fdctl,main,fd_fdctl fd_disco fd_ballet fd_tango fd_util fd_quic solana_validator_fd) $(OBJDIR)/obj/app/fdctl/configure/xdp.o: src/tango/xdp/fd_xdp_redirect_prog.o $(OBJDIR)/obj/app/fdctl/config.o: src/app/fdctl/config/default.toml diff --git a/src/app/fdctl/config.c b/src/app/fdctl/config.c index 7a4a82ac21..0e12156666 100644 --- a/src/app/fdctl/config.c +++ b/src/app/fdctl/config.c @@ -1,6 +1,7 @@ #include "fdctl.h" -#include "../frank/fd_frank.h" +#include "run/run.h" + #include "../../util/net/fd_eth.h" #include @@ -24,11 +25,10 @@ find_wksp( config_t * const config, FD_LOG_ERR(( "no workspace with name `%s` found", name )); } -/* partial frank_bank definition since the tile doesn't really exist */ -static fd_frank_task_t frank_bank = { +/* partial bank definition since the tile doesn't really exist */ +static fd_tile_config_t bank = { .in_wksp = "pack_bank", .out_wksp = "bank_shred", - .extra_wksp = NULL, }; ulong @@ -38,7 +38,7 @@ memlock_max_bytes( config_t * const config ) { workspace_config_t * wksp = &config->shmem.workspaces[ j ]; #define TILE_MAX( tile ) do { \ - ulong in_bytes = 0, out_bytes = 0, extra_bytes = 0; \ + ulong in_bytes = 0, 
out_bytes = 0; \ if( FD_LIKELY( tile.in_wksp ) ) { \ workspace_config_t * in_wksp = find_wksp( config, tile.in_wksp ); \ in_bytes = in_wksp->num_pages * in_wksp->page_size; \ @@ -47,43 +47,33 @@ memlock_max_bytes( config_t * const config ) { workspace_config_t * out_wksp = find_wksp( config, tile.out_wksp ); \ out_bytes = out_wksp->num_pages * out_wksp->page_size; \ } \ - if( FD_LIKELY( tile.extra_wksp ) ) { \ - workspace_config_t * extra_wksp = find_wksp( config, tile.extra_wksp ); \ - extra_bytes = extra_wksp->num_pages * extra_wksp->page_size; \ - } \ memlock_max_bytes = fd_ulong_max( memlock_max_bytes, \ wksp->page_size * wksp->num_pages + \ in_bytes + \ - out_bytes + \ - extra_bytes ); \ + out_bytes ); \ } while(0) switch ( wksp->kind ) { - case wksp_tpu_txn_data: case wksp_quic_verify: case wksp_verify_dedup: case wksp_dedup_pack: case wksp_pack_bank: - case wksp_pack_forward: case wksp_bank_shred: break; case wksp_quic: - TILE_MAX( frank_quic ); + TILE_MAX( quic ); break; case wksp_verify: - TILE_MAX( frank_verify ); + TILE_MAX( verify ); break; case wksp_dedup: - TILE_MAX( frank_dedup ); + TILE_MAX( dedup ); break; case wksp_pack: - TILE_MAX( frank_pack ); + TILE_MAX( pack ); break; case wksp_bank: - TILE_MAX( frank_bank ); - break; - case wksp_forward: - TILE_MAX( frank_forward ); + TILE_MAX( bank ); break; } } @@ -301,8 +291,6 @@ static void parse_key_value( config_t * config, ENTRY_UINT ( ., tiles.bank, receive_buffer_size ); - ENTRY_UINT ( ., tiles.forward, receive_buffer_size ); - ENTRY_UINT ( ., tiles.dedup, signature_cache_size ); } @@ -480,27 +468,21 @@ static void init_workspaces( config_t * config ) { ulong idx = 0; - config->shmem.workspaces[ idx ].kind = wksp_tpu_txn_data; - config->shmem.workspaces[ idx ].name = "tpu_txn_data"; - config->shmem.workspaces[ idx ].page_size = FD_SHMEM_GIGANTIC_PAGE_SZ; - config->shmem.workspaces[ idx ].num_pages = 1; - idx++; - config->shmem.workspaces[ idx ].kind = wksp_quic_verify; config->shmem.workspaces[ idx 
].name = "quic_verify"; - config->shmem.workspaces[ idx ].page_size = FD_SHMEM_HUGE_PAGE_SZ; - config->shmem.workspaces[ idx ].num_pages = 2; + config->shmem.workspaces[ idx ].page_size = FD_SHMEM_GIGANTIC_PAGE_SZ; + config->shmem.workspaces[ idx ].num_pages = 1; idx++; config->shmem.workspaces[ idx ].kind = wksp_verify_dedup; config->shmem.workspaces[ idx ].name = "verify_dedup"; - config->shmem.workspaces[ idx ].page_size = FD_SHMEM_HUGE_PAGE_SZ; - config->shmem.workspaces[ idx ].num_pages = 2; + config->shmem.workspaces[ idx ].page_size = FD_SHMEM_GIGANTIC_PAGE_SZ; + config->shmem.workspaces[ idx ].num_pages = 1; idx++; config->shmem.workspaces[ idx ].kind = wksp_dedup_pack; config->shmem.workspaces[ idx ].name = "dedup_pack"; - config->shmem.workspaces[ idx ].page_size = FD_SHMEM_HUGE_PAGE_SZ; + config->shmem.workspaces[ idx ].page_size = FD_SHMEM_GIGANTIC_PAGE_SZ; config->shmem.workspaces[ idx ].num_pages = 1; idx++; @@ -510,12 +492,6 @@ init_workspaces( config_t * config ) { config->shmem.workspaces[ idx ].num_pages = 1; idx++; - config->shmem.workspaces[ idx ].kind = wksp_pack_forward; - config->shmem.workspaces[ idx ].name = "pack_forward"; - config->shmem.workspaces[ idx ].page_size = FD_SHMEM_GIGANTIC_PAGE_SZ; - config->shmem.workspaces[ idx ].num_pages = 1; - idx++; - config->shmem.workspaces[ idx ].kind = wksp_bank_shred; config->shmem.workspaces[ idx ].name = "bank_shred"; config->shmem.workspaces[ idx ].page_size = FD_SHMEM_GIGANTIC_PAGE_SZ; @@ -552,12 +528,6 @@ init_workspaces( config_t * config ) { config->shmem.workspaces[ idx ].num_pages = 1; idx++; - config->shmem.workspaces[ idx ].kind = wksp_forward; - config->shmem.workspaces[ idx ].name = "forward"; - config->shmem.workspaces[ idx ].page_size = FD_SHMEM_GIGANTIC_PAGE_SZ; - config->shmem.workspaces[ idx ].num_pages = 1; - idx++; - for( ulong i=0; ilayout.bank_tile_count; i++ ) { config->shmem.workspaces[ idx ].kind = wksp_bank; config->shmem.workspaces[ idx ].name = "bank"; diff --git 
a/src/app/fdctl/config.h b/src/app/fdctl/config.h index d9a3e8f92b..07823cfe8d 100644 --- a/src/app/fdctl/config.h +++ b/src/app/fdctl/config.h @@ -14,19 +14,16 @@ typedef struct { enum { - wksp_tpu_txn_data, wksp_quic_verify, wksp_verify_dedup, wksp_dedup_pack, wksp_pack_bank, - wksp_pack_forward, wksp_bank_shred, wksp_quic, wksp_verify, wksp_dedup, wksp_pack, wksp_bank, - wksp_forward, } kind; char * name; ulong kind_idx; @@ -163,10 +160,6 @@ typedef struct { uint receive_buffer_size; } bank; - struct { - uint receive_buffer_size; - } forward; - struct { uint signature_cache_size; } dedup; diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index 2da3890485..072b9eb3e9 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -306,7 +306,7 @@ dynamic_port_range = "8000-10000" # # It is suggested to use all available CPU cores for Firedancer, so that the # Solana network can run as fast as possible. - affinity = "0-14" + affinity = "0-13" # How many verify tiles to run. Currently this also configures the number of # QUIC tiles to run. QUIC and verify tiles are connected 1:1. @@ -480,12 +480,6 @@ dynamic_port_range = "8000-10000" # dropped. TODO: ... Should this really be configurable? receive_buffer_size = 16384 - # The maximum size of a message from a QUIC tile to a verify tile. - # - # TODO: This should be removed. Not configuration, should be transaction - # MTU. - mtu = 4804 - # The pack tile takes incoming transactions that have been verified by the # verify tile and attempts to order them in an optimal way to generate the # most fees per compute resource used to execute them. @@ -520,12 +514,6 @@ dynamic_port_range = "8000-10000" [tiles.bank] receive_buffer_size = 128 - # The forward tile sends unprocessed transactions. - # Forwarding only occures when the current instance is the leader and it - # is unable to process all transactions in its leader slot. 
- [tiles.forward] - receive_buffer_size = 128 - # All transactions entering into the validator are deduplicated after their # signature is verified, to ensure the same transaction is not repeated # multiple times. diff --git a/src/app/fdctl/configure/large_pages.c b/src/app/fdctl/configure/large_pages.c index 3cd5ff71d7..b5f9c2bd5e 100644 --- a/src/app/fdctl/configure/large_pages.c +++ b/src/app/fdctl/configure/large_pages.c @@ -52,12 +52,10 @@ expected_pages( config_t * const config, uint out[2] ) { for( ulong i=0; ishmem.workspaces_cnt; i++ ) { switch( config->shmem.workspaces[ i ].kind ) { - case wksp_tpu_txn_data: case wksp_quic_verify: case wksp_verify_dedup: case wksp_dedup_pack: case wksp_pack_bank: - case wksp_pack_forward: case wksp_bank_shred: break; case wksp_quic: @@ -65,7 +63,6 @@ expected_pages( config_t * const config, uint out[2] ) { case wksp_dedup: case wksp_pack: case wksp_bank: - case wksp_forward: num_tiles++; break; } diff --git a/src/app/fdctl/configure/workspace.c b/src/app/fdctl/configure/workspace.c index bb15483039..252302a67f 100644 --- a/src/app/fdctl/configure/workspace.c +++ b/src/app/fdctl/configure/workspace.c @@ -1,5 +1,7 @@ #include "configure.h" +#include "../../../disco/fd_disco.h" + #include "../../../tango/fd_tango.h" #include "../../../tango/quic/fd_quic.h" #include "../../../tango/xdp/fd_xsk_aio.h" @@ -67,13 +69,6 @@ static void fseq( void * pod, char * fmt, ... ) { fd_fseq_new ( shmem, 0 ) ); } -static void tcache( void * pod, char * fmt, ulong depth, ... ) { - INSERTER( depth, - fd_tcache_align ( ), - fd_tcache_footprint( depth, 0 ), - fd_tcache_new ( shmem, depth, 0 ) ); -} - static void quic( void * pod, char * fmt, fd_quic_limits_t * limits, ... 
) { INSERTER( limits, fd_quic_align ( ), @@ -233,15 +228,11 @@ init( config_t * const config ) { WKSP_BEGIN( config, wksp1, 0 ); switch( wksp1->kind ) { - case wksp_tpu_txn_data: - for( ulong i=0; ilayout.verify_tile_count; i++ ) { - dcache( pod, "dcache%lu", config->tiles.verify.mtu, config->tiles.verify.receive_buffer_size, config->tiles.verify.receive_buffer_size * 32, i ); - } - break; case wksp_quic_verify: for( ulong i=0; ilayout.verify_tile_count; i++ ) { mcache( pod, "mcache%lu", config->tiles.verify.receive_buffer_size, i ); fseq ( pod, "fseq%lu", i ); + dcache( pod, "dcache%lu", FD_TPU_DCACHE_MTU, config->tiles.verify.receive_buffer_size, config->tiles.verify.receive_buffer_size * 32, i ); } break; case wksp_verify_dedup: @@ -249,27 +240,23 @@ init( config_t * const config ) { for( ulong i=0; ilayout.verify_tile_count; i++ ) { mcache( pod, "mcache%lu", config->tiles.verify.receive_buffer_size, i ); fseq ( pod, "fseq%lu", i ); + dcache( pod, "dcache%lu", FD_TPU_DCACHE_MTU, config->tiles.verify.receive_buffer_size, 0, i ); } break; case wksp_dedup_pack: mcache( pod, "mcache", config->tiles.verify.receive_buffer_size ); fseq ( pod, "fseq" ); + dcache( pod, "dcache", FD_TPU_DCACHE_MTU, config->tiles.verify.receive_buffer_size, 0 ); break; case wksp_pack_bank: - ulong1( pod, "num_tiles", config->layout.bank_tile_count ); + ulong1( pod, "cnt", config->layout.bank_tile_count ); + mcache( pod, "mcache", config->tiles.bank.receive_buffer_size ); + dcache( pod, "dcache", USHORT_MAX, config->layout.bank_tile_count * (ulong)config->tiles.bank.receive_buffer_size, 0 ); for( ulong i=0; ilayout.bank_tile_count; i++ ) { - mcache( pod, "mcache%lu", config->tiles.bank.receive_buffer_size, i ); - dcache( pod, "dcache%lu", USHORT_MAX, config->layout.bank_tile_count * (ulong)config->tiles.bank.receive_buffer_size, 0, i ); - fseq ( pod, "fseq%lu", i ); - mcache( pod, "mcache-back%lu", config->tiles.bank.receive_buffer_size, i ); - fseq ( pod, "fseq-back%lu", i ); + fseq( pod, 
"fseq%lu", i ); + fseq( pod, "busy%lu", i ); } break; - case wksp_pack_forward: - mcache( pod, "mcache", config->tiles.forward.receive_buffer_size ); - dcache( pod, "dcache", USHORT_MAX, (ulong)config->tiles.forward.receive_buffer_size, 0 ); - fseq ( pod, "fseq" ); - break; case wksp_bank_shred: for( ulong i=0; ilayout.bank_tile_count; i++ ) { mcache( pod, "mcache%lu", config->tiles.bank.receive_buffer_size, i ); @@ -313,7 +300,7 @@ init( config_t * const config ) { break; case wksp_dedup: cnc ( pod, "cnc", pod, wksp ); - tcache( pod, "tcache", config->tiles.dedup.signature_cache_size ); + ulong1( pod, "tcache_depth", config->tiles.dedup.signature_cache_size ); break; case wksp_pack: cnc ( pod, "cnc" ); @@ -322,9 +309,6 @@ init( config_t * const config ) { case wksp_bank: cnc ( pod, "cnc" ); break; - case wksp_forward: - cnc ( pod, "cnc" ); - break; } WKSP_END(); @@ -377,8 +361,8 @@ check( config_t * const config ) { configure_stage_t workspace = { .name = NAME, - /* we can't really verify if a frank workspace has been set up - correctly, so if we are running it we just recreate it every time */ + /* we can't really verify if a workspace has been set up correctly, so + if we are running it we just recreate it every time */ .always_recreate = 1, .enabled = NULL, .init_perm = init_perm, diff --git a/src/app/fdctl/monitor/monitor.c b/src/app/fdctl/monitor/monitor.c index c1cef8c790..5ac8e36d78 100644 --- a/src/app/fdctl/monitor/monitor.c +++ b/src/app/fdctl/monitor/monitor.c @@ -1,7 +1,7 @@ #include "../fdctl.h" #include "helper.h" -#include "../run.h" +#include "../run/run.h" #include "../../../disco/fd_disco.h" #include @@ -90,13 +90,13 @@ tile_snap( tile_snap_t * snap_cur, /* Snaphot for each tile, indexed [0, snap->cnc_signal = fd_cnc_signal_query ( cnc ); ulong const * cnc_diag = (ulong const *)fd_cnc_app_laddr_const( cnc ); FD_COMPILER_MFENCE(); - snap->cnc_diag_pid = cnc_diag[ FD_FRANK_CNC_DIAG_PID ]; - snap->cnc_diag_in_backp = cnc_diag[ 
FD_FRANK_CNC_DIAG_IN_BACKP ]; - snap->cnc_diag_backp_cnt = cnc_diag[ FD_FRANK_CNC_DIAG_BACKP_CNT ]; - snap->cnc_diag_ha_filt_cnt = cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_CNT ]; - snap->cnc_diag_ha_filt_sz = cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_SZ ]; - snap->cnc_diag_sv_filt_cnt = cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_CNT ]; - snap->cnc_diag_sv_filt_sz = cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_SZ ]; + snap->cnc_diag_pid = cnc_diag[ FD_APP_CNC_DIAG_PID ]; + snap->cnc_diag_in_backp = cnc_diag[ FD_APP_CNC_DIAG_IN_BACKP ]; + snap->cnc_diag_backp_cnt = cnc_diag[ FD_APP_CNC_DIAG_BACKP_CNT ]; + snap->cnc_diag_ha_filt_cnt = cnc_diag[ FD_APP_CNC_DIAG_HA_FILT_CNT ]; + snap->cnc_diag_ha_filt_sz = cnc_diag[ FD_APP_CNC_DIAG_HA_FILT_SZ ]; + snap->cnc_diag_sv_filt_cnt = cnc_diag[ FD_APP_CNC_DIAG_SV_FILT_CNT ]; + snap->cnc_diag_sv_filt_sz = cnc_diag[ FD_APP_CNC_DIAG_SV_FILT_SZ ]; FD_COMPILER_MFENCE(); } } @@ -195,21 +195,30 @@ run_monitor( config_t * const config, long duration, uint seed, double ns_per_tic ) { - ulong tile_cnt = - config->layout.verify_tile_count + // QUIC tiles - config->layout.verify_tile_count + // verify tiles - 1 + // dedup tile - 1 + // pack tile - config->layout.bank_tile_count + // bank tiles - 1; // forward tile + ulong tile_cnt = 0; + for( ulong i=0; ishmem.workspaces_cnt; i++ ) { + switch( config->shmem.workspaces[ i ].kind ) { + case wksp_quic_verify: + case wksp_verify_dedup: + case wksp_dedup_pack: + case wksp_pack_bank: + case wksp_bank_shred: + break; + case wksp_quic: + case wksp_verify: + case wksp_dedup: + case wksp_pack: + case wksp_bank: + tile_cnt++; + break; + } + } ulong link_cnt = config->layout.verify_tile_count + // quic <-> verify config->layout.verify_tile_count + // verify <-> dedup 1 + // dedup <-> pack - config->layout.bank_tile_count + // pack <-> bank - config->layout.bank_tile_count + // bank <-> pack - 1; // pack <-> forward + config->layout.bank_tile_count; // pack <-> bank tile_t * tiles = fd_alloca( alignof(tile_t *), 
sizeof(tile_t)*tile_cnt ); link_t * links = fd_alloca( alignof(link_t *), sizeof(link_t)*link_cnt ); @@ -223,8 +232,6 @@ run_monitor( config_t * const config, char buf[ 64 ]; switch( wksp->kind ) { - case wksp_tpu_txn_data: - break; case wksp_quic_verify: for( ulong i=0; ilayout.verify_tile_count; i++ ) { links[ link_idx ].src_name = "quic"; @@ -260,30 +267,13 @@ run_monitor( config_t * const config, for( ulong i=0; ilayout.bank_tile_count; i++ ) { links[ link_idx ].src_name = "pack"; links[ link_idx ].dst_name = "bank"; - links[ link_idx ].mcache = fd_mcache_join( fd_wksp_pod_map( pods[ j ], snprintf1( buf, 64, "mcache%lu", i ) ) ); + links[ link_idx ].mcache = fd_mcache_join( fd_wksp_pod_map( pods[ j ], "mcache" ) ); /* shared mcache from mux tile */ if( FD_UNLIKELY( !links[ link_idx ].mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); links[ link_idx ].fseq = fd_fseq_join( fd_wksp_pod_map( pods[ j ], snprintf1( buf, 64, "fseq%lu", i ) ) ); if( FD_UNLIKELY( !links[ link_idx ].fseq ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); link_idx++; - - links[ link_idx ].src_name = "bank"; - links[ link_idx ].dst_name = "pack"; - links[ link_idx ].mcache = fd_mcache_join( fd_wksp_pod_map( pods[ j ], snprintf1( buf, 64, "mcache-back%lu", i ) ) ); - if( FD_UNLIKELY( !links[ link_idx ].mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - links[ link_idx ].fseq = fd_fseq_join( fd_wksp_pod_map( pods[ j ], snprintf1( buf, 64, "fseq-back%lu", i ) ) ); - if( FD_UNLIKELY( !links[ link_idx ].fseq ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); - link_idx++; } break; - case wksp_pack_forward: - links[ link_idx ].src_name = "pack"; - links[ link_idx ].dst_name = "forward"; - links[ link_idx ].mcache = fd_mcache_join( fd_wksp_pod_map( pods[ j ], "mcache" ) ); - if( FD_UNLIKELY( !links[ link_idx ].mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - links[ link_idx ].fseq = fd_fseq_join( fd_wksp_pod_map( pods[ j ], "fseq" ) ); - if( FD_UNLIKELY( !links[ link_idx ].fseq ) ) FD_LOG_ERR(( 
"fd_fseq_join failed" )); - link_idx++; - break; case wksp_bank_shred: break; case wksp_quic: @@ -321,13 +311,6 @@ run_monitor( config_t * const config, if( FD_UNLIKELY( fd_cnc_app_sz( tiles[ tile_idx ].cnc )<64UL ) ) FD_LOG_ERR(( "cnc app sz should be at least 64 bytes" )); tile_idx++; break; - case wksp_forward: - tiles[ tile_idx ].name = "forward"; - tiles[ tile_idx ].cnc = fd_cnc_join( fd_wksp_pod_map( pod, "cnc" ) ); - if( FD_UNLIKELY( !tiles[ tile_idx ].cnc ) ) FD_LOG_ERR(( "fd_cnc_join failed" )); - if( FD_UNLIKELY( fd_cnc_app_sz( tiles[ tile_idx ].cnc )<64UL ) ) FD_LOG_ERR(( "cnc app sz should be at least 64 bytes" )); - tile_idx++; - break; } } diff --git a/src/app/fdctl/ready.c b/src/app/fdctl/ready.c index 7d61d9f781..0107d3aa8b 100644 --- a/src/app/fdctl/ready.c +++ b/src/app/fdctl/ready.c @@ -1,6 +1,6 @@ #include "fdctl.h" -#include "run.h" +#include "run/run.h" #include "../../tango/fd_tango.h" @@ -12,15 +12,12 @@ ready_cmd_fn( args_t * args, for( ulong i=0; ishmem.workspaces_cnt; i++ ) { workspace_config_t * wksp = &config->shmem.workspaces[i]; switch( wksp->kind ) { - case wksp_tpu_txn_data: case wksp_quic_verify: case wksp_verify_dedup: case wksp_dedup_pack: case wksp_pack_bank: - case wksp_pack_forward: case wksp_bank_shred: case wksp_bank: - case wksp_forward: break; case wksp_quic: case wksp_verify: diff --git a/src/app/fdctl/run.h b/src/app/fdctl/run.h deleted file mode 100644 index 59a938492d..0000000000 --- a/src/app/fdctl/run.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef HEADER_fd_src_app_fdctl_run_h -#define HEADER_fd_src_app_fdctl_run_h - -#include "fdctl.h" -#include "../frank/fd_frank.h" - -typedef struct { - fd_frank_task_t * tile; - ulong tile_idx; - ulong idx; - int sandbox; - uid_t uid; - gid_t gid; - char * app_name; - double tick_per_ns; -} tile_main_args_t; - -const uchar * -workspace_pod_join( char * app_name, - char * tile_name, - ulong tile_idx ); - -int -solana_labs_main( void * args ); - -int -tile_main( void * _args ); - -void 
-run_firedancer( config_t * const config ); - -#endif /* HEADER_fd_src_app_fdctl_run_h */ diff --git a/src/app/fdctl/run.c b/src/app/fdctl/run/run.c similarity index 93% rename from src/app/fdctl/run.c rename to src/app/fdctl/run/run.c index 818ed9e630..f5d6fc3494 100644 --- a/src/app/fdctl/run.c +++ b/src/app/fdctl/run/run.c @@ -1,7 +1,9 @@ #define _GNU_SOURCE #include "run.h" -#include "configure/configure.h" +#include "../configure/configure.h" + +#include "../../../util/wksp/fd_wksp_private.h" #include #include @@ -11,8 +13,6 @@ #include #include -#include "../../util/wksp/fd_wksp_private.h" - void run_cmd_perm( args_t * args, security_t * security, @@ -61,7 +61,6 @@ typedef struct { char child_names[ FD_TILE_MAX + 1 ][ 32 ]; uid_t uid; gid_t gid; - double tick_per_ns; } tile_spawner_t; const uchar * @@ -106,7 +105,7 @@ tile_main( void * _args ) { FD_LOG_NOTICE(( "booting tile %s(%lu) pid(%d)", args->tile->name, args->tile_idx, pid )); install_tile_signals(); - fd_frank_args_t frank_args = { + fd_tile_args_t tile_args = { .pid = pid, .tile_idx = args->tile_idx, .idx = args->idx, @@ -114,22 +113,18 @@ tile_main( void * _args ) { .tile_name = args->tile->name, .in_pod = NULL, .out_pod = NULL, - .extra_pod = NULL, - .tick_per_ns = args->tick_per_ns, }; - frank_args.tile_pod = workspace_pod_join( args->app_name, args->tile->name, args->tile_idx ); + tile_args.tile_pod = workspace_pod_join( args->app_name, args->tile->name, args->tile_idx ); if( FD_LIKELY( args->tile->in_wksp ) ) - frank_args.in_pod = workspace_pod_join( args->app_name, args->tile->in_wksp, 0 ); + tile_args.in_pod = workspace_pod_join( args->app_name, args->tile->in_wksp, 0 ); if( FD_LIKELY( args->tile->out_wksp ) ) - frank_args.out_pod = workspace_pod_join( args->app_name, args->tile->out_wksp, 0 ); - if( FD_LIKELY( args->tile->extra_wksp ) ) - frank_args.extra_pod = workspace_pod_join( args->app_name, args->tile->extra_wksp, 0 ); + tile_args.out_pod = workspace_pod_join( args->app_name, 
args->tile->out_wksp, 0 ); - if( FD_UNLIKELY( args->tile->init ) ) args->tile->init( &frank_args ); + if( FD_UNLIKELY( args->tile->init ) ) args->tile->init( &tile_args ); int allow_fds[ 32 ]; - ulong allow_fds_sz = args->tile->allow_fds( &frank_args, + ulong allow_fds_sz = args->tile->allow_fds( &tile_args, sizeof(allow_fds)/sizeof(allow_fds[0]), allow_fds ); @@ -140,12 +135,12 @@ tile_main( void * _args ) { allow_fds, args->tile->allow_syscalls_sz, args->tile->allow_syscalls ); - args->tile->run( &frank_args ); + args->tile->run( &tile_args ); return 0; } static void -clone_tile( tile_spawner_t * spawn, fd_frank_task_t * task, ulong idx ) { +clone_tile( tile_spawner_t * spawn, fd_tile_config_t * tile, ulong idx ) { ushort cpu_idx = spawn->tile_to_cpu[ spawn->idx ]; cpu_set_t cpu_set[1]; if( FD_LIKELY( cpu_idx<65535UL ) ) { @@ -176,11 +171,10 @@ clone_tile( tile_spawner_t * spawn, fd_frank_task_t * task, ulong idx ) { .app_name = spawn->app_name, .tile_idx = idx, .idx = spawn->idx, - .tile = task, + .tile = tile, .sandbox = spawn->sandbox, .uid = spawn->uid, .gid = spawn->gid, - .tick_per_ns = spawn->tick_per_ns, }; /* also spawn tiles into pid namespaces so they cannot signal each other or the parent */ @@ -189,7 +183,7 @@ clone_tile( tile_spawner_t * spawn, fd_frank_task_t * task, ulong idx ) { if( FD_UNLIKELY( pid<0 ) ) FD_LOG_ERR(( "clone() failed (%i-%s)", errno, fd_io_strerror( errno ) )); spawn->child_pids[ spawn->idx ] = pid; - strncpy( spawn->child_names[ spawn->idx ], task->name, 32 ); + strncpy( spawn->child_names[ spawn->idx ], tile->name, 32 ); spawn->idx++; } @@ -348,12 +342,10 @@ main_pid_namespace( void * args ) { ulong tile_cnt = 0; for( ulong i=0; ishmem.workspaces_cnt; i++ ) { switch( config->shmem.workspaces[ i ].kind ) { - case wksp_tpu_txn_data: case wksp_quic_verify: case wksp_verify_dedup: case wksp_dedup_pack: case wksp_pack_bank: - case wksp_pack_forward: case wksp_bank_shred: break; case wksp_quic: @@ -361,7 +353,6 @@ main_pid_namespace( 
void * args ) { case wksp_dedup: case wksp_pack: case wksp_bank: - case wksp_forward: tile_cnt++; break; } @@ -369,9 +360,6 @@ main_pid_namespace( void * args ) { if( FD_UNLIKELY( affinity_tile_cnttile_cnt ) ) FD_LOG_WARNING(( "only %lu tiles required for this config", tile_cnt )); - /* eat calibration cost at deterministic place */ - double tick_per_ns = fd_tempo_tick_per_ns( NULL ); - /* Save the current affinity, it will be restored after creating any child tiles */ cpu_set_t floating_cpu_set[1]; if( FD_UNLIKELY( sched_getaffinity( 0, sizeof(cpu_set_t), floating_cpu_set ) ) ) @@ -385,7 +373,6 @@ main_pid_namespace( void * args ) { .sandbox = config->development.sandbox, .uid = config->uid, .gid = config->gid, - .tick_per_ns = tick_per_ns, }; clone_solana_labs( &spawner, config ); @@ -395,11 +382,10 @@ main_pid_namespace( void * args ) { close_network_namespace_original_fd(); } - for( ulong i=0; ilayout.verify_tile_count; i++ ) clone_tile( &spawner, &frank_quic, i ); - for( ulong i=0; ilayout.verify_tile_count; i++ ) clone_tile( &spawner, &frank_verify, i ); - clone_tile( &spawner, &frank_dedup, 0 ); - clone_tile( &spawner, &frank_pack , 0 ); - clone_tile( &spawner, &frank_forward , 0 ); + for( ulong i=0; ilayout.verify_tile_count; i++ ) clone_tile( &spawner, &quic, i ); + for( ulong i=0; ilayout.verify_tile_count; i++ ) clone_tile( &spawner, &verify, i ); + clone_tile( &spawner, &dedup, 0 ); + clone_tile( &spawner, &pack , 0 ); if( FD_UNLIKELY( sched_setaffinity( 0, sizeof(cpu_set_t), floating_cpu_set ) ) ) FD_LOG_ERR(( "sched_setaffinity failed (%i-%s)", errno, fd_io_strerror( errno ) )); diff --git a/src/app/fdctl/run/run.h b/src/app/fdctl/run/run.h new file mode 100644 index 0000000000..f342ccd0d1 --- /dev/null +++ b/src/app/fdctl/run/run.h @@ -0,0 +1,61 @@ +#ifndef HEADER_fd_src_app_fdctl_run_h +#define HEADER_fd_src_app_fdctl_run_h + +#include "../fdctl.h" + +#include "../../../tango/xdp/fd_xsk.h" + +typedef struct { + int pid; + char * app_name; + char * 
tile_name; + ulong tile_idx; + ulong idx; + uchar const * tile_pod; + uchar const * in_pod; + uchar const * out_pod; + fd_xsk_t * xsk; + fd_xsk_t * lo_xsk; +} fd_tile_args_t; + +typedef struct { + char * name; + char * in_wksp; + char * out_wksp; + ushort allow_syscalls_sz; + long * allow_syscalls; + ulong (*allow_fds)( fd_tile_args_t * args, ulong out_fds_sz, int * out_fds ); + void (*init)( fd_tile_args_t * args ); + void (*run )( fd_tile_args_t * args ); +} fd_tile_config_t; + +extern fd_tile_config_t verify; +extern fd_tile_config_t dedup; +extern fd_tile_config_t quic; +extern fd_tile_config_t pack; + +typedef struct { + fd_tile_config_t * tile; + ulong tile_idx; + ulong idx; + int sandbox; + uid_t uid; + gid_t gid; + char * app_name; +} tile_main_args_t; + +const uchar * +workspace_pod_join( char * app_name, + char * tile_name, + ulong tile_idx ); + +int +solana_labs_main( void * args ); + +int +tile_main( void * _args ); + +void +run_firedancer( config_t * const config ); + +#endif /* HEADER_fd_src_app_fdctl_run_h */ diff --git a/src/app/fdctl/run/tiles/dedup.c b/src/app/fdctl/run/tiles/dedup.c new file mode 100644 index 0000000000..20edc86941 --- /dev/null +++ b/src/app/fdctl/run/tiles/dedup.c @@ -0,0 +1,82 @@ +#include "../../fdctl.h" +#include "../run.h" + +#include "../../../../disco/fd_disco.h" + +#include + +static void +init( fd_tile_args_t * args ) { + (void)args; + + /* calling fd_tempo_tick_per_ns requires nanosleep, it is cached with + a FD_ONCE */ + fd_tempo_tick_per_ns( NULL ); +} + +static void +run( fd_tile_args_t * args ) { + ulong in_cnt = fd_pod_query_ulong( args->in_pod, "cnt", 0UL ); + + fd_frag_meta_t const ** in_mcache = (fd_frag_meta_t const **)fd_alloca( alignof(fd_frag_meta_t const *), sizeof(fd_frag_meta_t const *)*in_cnt ); + const uchar ** in_dcache = (const uchar **)fd_alloca( alignof(ulong *), sizeof(ulong *)*in_cnt ); + ulong ** in_fseq = (ulong **)fd_alloca( alignof(ulong *), sizeof(ulong *)*in_cnt ); + if( FD_UNLIKELY( 
!in_mcache || !in_dcache || !in_fseq ) ) FD_LOG_ERR(( "fd_alloca failed" )); + + for( ulong i=0; iin_pod, mcache ) ); + in_dcache[i] = fd_dcache_join( fd_wksp_pod_map( args->in_pod, dcache ) ); + in_fseq[i] = fd_fseq_join ( fd_wksp_pod_map( args->in_pod, fseq ) ); + } + + ulong tcache_depth = fd_pod_query_ulong( args->tile_pod, "tcache_depth", 0UL ); + + fd_rng_t _rng[1]; + fd_dedup_tile( fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ), + (ulong)args->pid, + in_cnt, + in_mcache, + in_fseq, + in_dcache, + fd_tcache_join( fd_tcache_new( fd_wksp_alloc_laddr( fd_wksp_containing( args->tile_pod ), FD_TCACHE_ALIGN, FD_TCACHE_FOOTPRINT( tcache_depth, 0 ), 1UL ), tcache_depth, 0 ) ), + fd_mcache_join( fd_wksp_pod_map( args->out_pod, "mcache" ) ), + fd_dcache_join( fd_wksp_pod_map( args->out_pod, "dcache" ) ), + 1, + &(ulong*){ fd_fseq_join( fd_wksp_pod_map( args->out_pod, "fseq" ) ) }, + 0, + 0, + fd_rng_join( fd_rng_new( _rng, 0, 0UL ) ), + fd_alloca( FD_DEDUP_TILE_SCRATCH_ALIGN, FD_DEDUP_TILE_SCRATCH_FOOTPRINT( in_cnt, 1 ) ) ); +} + +static long allow_syscalls[] = { + __NR_write, /* logging */ + __NR_fsync, /* logging, WARNING and above fsync immediately */ +}; + +static ulong +allow_fds( fd_tile_args_t * args, + ulong out_fds_sz, + int * out_fds ) { + (void)args; + if( FD_UNLIKELY( out_fds_sz < 2 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); + out_fds[ 0 ] = 2; /* stderr */ + out_fds[ 1 ] = 3; /* logfile */ + return 2; +} + +fd_tile_config_t dedup = { + .name = "dedup", + .in_wksp = "verify_dedup", + .out_wksp = "dedup_pack", + .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), + .allow_syscalls = allow_syscalls, + .allow_fds = allow_fds, + .init = init, + .run = run, +}; diff --git a/src/app/fdctl/run/tiles/pack.c b/src/app/fdctl/run/tiles/pack.c new file mode 100644 index 0000000000..2788ea3a11 --- /dev/null +++ b/src/app/fdctl/run/tiles/pack.c @@ -0,0 +1,90 @@ +#include "../../fdctl.h" +#include "../run.h" + +#include 
"../../../../disco/fd_disco.h" + +#include + +#define FD_PACK_TAG (0x17ac1C711eUL) + +static void +init( fd_tile_args_t * args ) { + (void)args; + + /* calling fd_tempo_tick_per_ns requires nanosleep, it is cached with + a FD_ONCE */ + fd_tempo_tick_per_ns( NULL ); +} + +static void +run( fd_tile_args_t * args ) { + ulong out_cnt = fd_pod_query_ulong( args->out_pod, "cnt", 0UL ); + if( FD_UNLIKELY( !out_cnt ) ) FD_LOG_ERR(( "num_tiles unset or set to zero" )); + + ulong ** out_fseq = (ulong **)fd_alloca( alignof(ulong *), sizeof(ulong *)*out_cnt ); + ulong ** out_busy = (ulong **)fd_alloca( alignof(ulong *), sizeof(ulong *)*out_cnt ); + if( FD_UNLIKELY( !out_fseq || !out_busy ) ) FD_LOG_ERR(( "fd_alloca failed" )); + + for( ulong i=0; iout_pod, fseq ) ); + out_busy[i] = fd_fseq_join( fd_wksp_pod_map( args->out_pod, busy ) ); + } + + ulong pack_depth = fd_pod_query_ulong( args->tile_pod, "depth", 0UL ); + if( FD_UNLIKELY( !pack_depth ) ) FD_LOG_ERR(( "depth unset or set to zero" )); + + ulong max_txn_per_microblock = MAX_MICROBLOCK_SZ/sizeof(fd_txn_p_t); + ulong pack_footprint = fd_pack_footprint( pack_depth, out_cnt, max_txn_per_microblock ); + + FD_LOG_INFO(( "packing blocks of at most %lu transactions to %lu bank tiles", max_txn_per_microblock, out_cnt )); + + fd_rng_t _rng[1]; + fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, 0, 0UL ) ); + fd_pack_tile( fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ), + (ulong)args->pid, + 1, + (const fd_frag_meta_t **)&(fd_frag_meta_t*){ fd_mcache_join( fd_wksp_pod_map( args->in_pod, "mcache" ) ) }, + &(ulong*){ fd_fseq_join( fd_wksp_pod_map( args->in_pod, "fseq" ) ) }, + (const uchar **)&(uchar*){ fd_dcache_join( fd_wksp_pod_map( args->in_pod, "dcache" ) ) }, + fd_pack_join( fd_pack_new( fd_wksp_alloc_laddr( fd_wksp_containing( args->tile_pod ), fd_pack_align(), pack_footprint, FD_PACK_TAG ), pack_depth, out_cnt, max_txn_per_microblock, rng ) ), + fd_mcache_join( fd_wksp_pod_map( args->out_pod, "mcache" ) ), + 
fd_dcache_join( fd_wksp_pod_map( args->out_pod, "dcache" ) ), + out_cnt, + out_fseq, + out_busy, + 0, + 0, + rng, + fd_alloca( FD_PACK_TILE_SCRATCH_ALIGN, FD_PACK_TILE_SCRATCH_FOOTPRINT( 1, out_cnt ) ) ); +} + +static long allow_syscalls[] = { + __NR_write, /* logging */ + __NR_fsync, /* logging, WARNING and above fsync immediately */ +}; + +static ulong +allow_fds( fd_tile_args_t * args, + ulong out_fds_sz, + int * out_fds ) { + (void)args; + if( FD_UNLIKELY( out_fds_sz < 2 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); + out_fds[ 0 ] = 2; /* stderr */ + out_fds[ 1 ] = 3; /* logfile */ + return 2; +} + +fd_tile_config_t pack = { + .name = "pack", + .in_wksp = "dedup_pack", + .out_wksp = "pack_bank", + .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), + .allow_syscalls = allow_syscalls, + .allow_fds = allow_fds, + .init = init, + .run = run, +}; diff --git a/src/app/fdctl/run/tiles/quic.c b/src/app/fdctl/run/tiles/quic.c new file mode 100644 index 0000000000..bde1946bca --- /dev/null +++ b/src/app/fdctl/run/tiles/quic.c @@ -0,0 +1,142 @@ +#include "../../fdctl.h" +#include "../run.h" + +#include "../../../../disco/fd_disco.h" +#include "../../../../tango/xdp/fd_xsk_private.h" + +#include + +#include +#include + +static void +init( fd_tile_args_t * args ) { + FD_LOG_INFO(( "loading %s", "xsk" )); + args->xsk = fd_xsk_join( fd_wksp_pod_map( args->tile_pod, "xsk" ) ); + if( FD_UNLIKELY( !args->xsk ) ) FD_LOG_ERR(( "fd_xsk_join failed" )); + + args->lo_xsk = NULL; + if( FD_UNLIKELY( fd_pod_query_cstr( args->tile_pod, "lo_xsk", NULL ) ) ) { + FD_LOG_INFO(( "loading %s", "lo_xsk" )); + args->lo_xsk = fd_xsk_join( fd_wksp_pod_map( args->tile_pod, "lo_xsk" ) ); + if( FD_UNLIKELY( !args->lo_xsk ) ) FD_LOG_ERR(( "fd_xsk_join (lo) failed" )); + } + + /* call wallclock so glibc loads VDSO, which requires calling mmap while + privileged */ + fd_log_wallclock(); + + /* calling fd_tempo_tick_per_ns requires nanosleep, it is cached with + a FD_ONCE */ + 
fd_tempo_tick_per_ns( NULL ); + + /* OpenSSL goes and tries to read files and allocate memory and + other dumb things on a thread local basis, so we need a special + initializer to do it before seccomp happens in the process. */ + ERR_STATE * state = ERR_get_state(); + if( FD_UNLIKELY( !state )) FD_LOG_ERR(( "ERR_get_state failed" )); + if( FD_UNLIKELY( !OPENSSL_init_ssl( OPENSSL_INIT_LOAD_SSL_STRINGS , NULL ) ) ) + FD_LOG_ERR(( "OPENSSL_init_ssl failed" )); + if( FD_UNLIKELY( !OPENSSL_init_crypto( OPENSSL_INIT_LOAD_CRYPTO_STRINGS | OPENSSL_INIT_NO_LOAD_CONFIG , NULL ) ) ) + FD_LOG_ERR(( "OPENSSL_init_crypto failed" )); +} + +static ushort +initialize_quic( fd_quic_config_t * config, uchar const * pod ) { + uint ip_addr = fd_pod_query_uint( pod, "ip_addr", 0 ); + if( FD_UNLIKELY( !ip_addr ) ) FD_LOG_ERR(( "ip_addr not set" )); + + const void * src_mac = fd_pod_query_buf( pod, "src_mac_addr", NULL ); + if( FD_UNLIKELY( !src_mac ) ) FD_LOG_ERR(( "src_mac_addr not set" )); + + ushort transaction_listen_port = fd_pod_query_ushort( pod, "transaction_listen_port", 0 ); + if( FD_UNLIKELY( !transaction_listen_port ) ) FD_LOG_ERR(( "transaction_listen_port not set" )); + + ushort quic_transaction_listen_port = fd_pod_query_ushort( pod, "quic_transaction_listen_port", 0 ); + if( FD_UNLIKELY( !quic_transaction_listen_port ) ) FD_LOG_ERR(( "quic_transaction_listen_port not set" )); + + ulong idle_timeout_ms = fd_pod_query_ulong( pod, "idle_timeout_ms", 0 ); + if( FD_UNLIKELY( !idle_timeout_ms ) ) FD_LOG_ERR(( "idle_timeout_ms not set" )); + + ulong initial_rx_max_stream_data = fd_pod_query_ulong( pod, "initial_rx_max_stream_data", 1<<15 ); + if( FD_UNLIKELY( !initial_rx_max_stream_data ) ) FD_LOG_ERR(( "initial_rx_max_stream_data not set" )); + + config->role = FD_QUIC_ROLE_SERVER; + config->net.ip_addr = ip_addr; + fd_memcpy( config->link.src_mac_addr, src_mac, 6 ); + config->net.listen_udp_port = quic_transaction_listen_port; + config->idle_timeout = idle_timeout_ms * 
1000000UL; + config->initial_rx_max_stream_data = initial_rx_max_stream_data; + + return transaction_listen_port; +} + +static void +run( fd_tile_args_t * args ) { + char _mcache[32], fseq[32], dcache[32]; + snprintf( _mcache, sizeof(_mcache), "mcache%lu", args->tile_idx ); + snprintf( fseq, sizeof(fseq), "fseq%lu", args->tile_idx ); + snprintf( dcache, sizeof(dcache), "dcache%lu", args->tile_idx ); + + fd_quic_t * quic = fd_quic_join( fd_wksp_pod_map( args->tile_pod, "quic" ) ); + if( FD_UNLIKELY( !quic ) ) FD_LOG_ERR(( "fd_quic_join failed" )); + ushort legacy_transaction_port = initialize_quic( &quic->config, args->tile_pod ); + + ulong xsk_aio_cnt = 1; + fd_xsk_aio_t * xsk_aio[2] = { fd_xsk_aio_join( fd_wksp_pod_map( args->tile_pod, "xsk_aio" ), args->xsk ), NULL }; + if( FD_UNLIKELY( args->lo_xsk ) ) { + xsk_aio[1] = fd_xsk_aio_join( fd_wksp_pod_map( args->tile_pod, "lo_xsk_aio" ), args->lo_xsk ); + xsk_aio_cnt += 1; + } + + fd_frag_meta_t * mcache = fd_mcache_join( fd_wksp_pod_map( args->out_pod, _mcache ) ); + if( FD_UNLIKELY( !mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); + ulong depth = fd_mcache_depth( mcache ); + + fd_rng_t _rng[ 1 ]; + fd_quic_tile( fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ), + (ulong)args->pid, + quic, + legacy_transaction_port, + xsk_aio_cnt, + xsk_aio, + mcache, + fd_dcache_join( fd_wksp_pod_map( args->out_pod, dcache ) ), + 0, + 0, + fd_rng_join( fd_rng_new( _rng, 0, 0UL ) ), + fd_alloca( FD_QUIC_TILE_SCRATCH_ALIGN, fd_quic_tile_scratch_footprint( depth, 0, 1 ) ) ); +} + +static long allow_syscalls[] = { + __NR_write, /* logging */ + __NR_fsync, /* logging, WARNING and above fsync immediately */ + __NR_getpid, /* OpenSSL RAND_bytes checks pid, temporarily used as part of quic_init to generate a certificate */ + __NR_getrandom, /* OpenSSL RAND_bytes reads getrandom, temporarily used as part of quic_init to generate a certificate */ + __NR_madvise, /* OpenSSL SSL_do_handshake() uses an arena which eventually calls 
_rjem_je_pages_purge_forced */ + __NR_sendto, /* fd_xsk requires sendto */ + __NR_mmap, /* OpenSSL again... deep inside SSL_provide_quic_data() some jemalloc code calls mmap */ +}; + +static ulong +allow_fds( fd_tile_args_t * args, + ulong out_fds_sz, + int * out_fds ) { + if( FD_UNLIKELY( out_fds_sz < 4 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); + out_fds[ 0 ] = 2; /* stderr */ + out_fds[ 1 ] = 3; /* logfile */ + out_fds[ 2 ] = args->xsk->xsk_fd; + out_fds[ 3 ] = args->lo_xsk ? args->lo_xsk->xsk_fd : -1; + return args->lo_xsk ? 4 : 3; +} + +fd_tile_config_t quic = { + .name = "quic", + .in_wksp = NULL, + .out_wksp = "quic_verify", + .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), + .allow_syscalls = allow_syscalls, + .allow_fds = allow_fds, + .init = init, + .run = run, +}; diff --git a/src/app/fdctl/run/tiles/verify.c b/src/app/fdctl/run/tiles/verify.c new file mode 100644 index 0000000000..51e8c1c793 --- /dev/null +++ b/src/app/fdctl/run/tiles/verify.c @@ -0,0 +1,69 @@ +#include "../../fdctl.h" +#include "../run.h" + +#include "../../../../disco/fd_disco.h" + +#include + +static void +init( fd_tile_args_t * args ) { + (void)args; + + /* calling fd_tempo_tick_per_ns requires nanosleep, it is cached with + a FD_ONCE */ + fd_tempo_tick_per_ns( NULL ); +} + +static void +run( fd_tile_args_t * args ) { + char mcache[32], fseq[32], dcache[32]; + snprintf( mcache, sizeof(mcache), "mcache%lu", args->tile_idx ); + snprintf( fseq, sizeof(fseq), "fseq%lu", args->tile_idx ); + snprintf( dcache, sizeof(dcache), "dcache%lu", args->tile_idx ); + + fd_sha512_t _sha[1]; + fd_rng_t _rng[1]; + fd_verify_tile( fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ), + (ulong)args->pid, + 1, + (const fd_frag_meta_t **)&(fd_frag_meta_t*){ fd_mcache_join( fd_wksp_pod_map( args->in_pod, mcache ) ) }, + &(ulong*){ fd_fseq_join( fd_wksp_pod_map( args->in_pod, fseq ) ) }, + (const uchar**)&(uchar*){ fd_dcache_join( fd_wksp_pod_map( args->in_pod, dcache ) ) 
}, + fd_sha512_join( fd_sha512_new( _sha ) ), + fd_tcache_join( fd_tcache_new( fd_wksp_alloc_laddr( fd_wksp_containing( args->tile_pod ), FD_TCACHE_ALIGN, FD_TCACHE_FOOTPRINT( 16UL, 64UL ), 1UL ), 16UL, 64UL ) ), + fd_mcache_join( fd_wksp_pod_map( args->out_pod, mcache ) ), + fd_dcache_join( fd_wksp_pod_map( args->out_pod, dcache ) ), + 1, + &(ulong*){ fd_fseq_join( fd_wksp_pod_map( args->out_pod, fseq ) ) }, + 0, + 0, + fd_rng_join( fd_rng_new( _rng, 0, 0UL ) ), + fd_alloca( FD_VERIFY_TILE_SCRATCH_ALIGN, FD_VERIFY_TILE_SCRATCH_FOOTPRINT( 1UL, 1UL ) ) ); +} + +static long allow_syscalls[] = { + __NR_write, /* logging */ + __NR_fsync, /* logging, WARNING and above fsync immediately */ +}; + +static ulong +allow_fds( fd_tile_args_t * args, + ulong out_fds_sz, + int * out_fds ) { + (void)args; + if( FD_UNLIKELY( out_fds_sz < 2 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); + out_fds[ 0 ] = 2; /* stderr */ + out_fds[ 1 ] = 3; /* logfile */ + return 2; +} + +fd_tile_config_t verify = { + .name = "verify", + .in_wksp = "quic_verify", + .out_wksp = "verify_dedup", + .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), + .allow_syscalls = allow_syscalls, + .allow_fds = allow_fds, + .init = init, + .run = run, +}; diff --git a/src/app/fddev/Local.mk b/src/app/fddev/Local.mk index 71e01af87b..5ab21baa4f 100644 --- a/src/app/fddev/Local.mk +++ b/src/app/fddev/Local.mk @@ -4,7 +4,7 @@ ifdef FD_HAS_X86 ifdef FD_HAS_DOUBLE .PHONY: fddev -$(call make-bin-rust,fddev,main dev dev1 txn configure/netns configure/genesis,fd_fdctl fd_frank fd_disco fd_ballet fd_tango fd_util fd_quic solana_validator_fd) +$(call make-bin-rust,fddev,main dev dev1 txn configure/netns configure/genesis,fd_fdctl fd_disco fd_ballet fd_tango fd_util fd_quic solana_validator_fd) ifeq (run,$(firstword $(MAKECMDGOALS))) RUN_ARGS := $(wordlist 2,$(words $(MAKECMDGOALS)),$(MAKECMDGOALS)) diff --git a/src/app/fddev/dev.c b/src/app/fddev/dev.c index 37e4bc8407..a674bf3a1a 100644 --- 
a/src/app/fddev/dev.c +++ b/src/app/fddev/dev.c @@ -2,7 +2,7 @@ #include "fddev.h" #include "../fdctl/configure/configure.h" -#include "../fdctl/run.h" +#include "../fdctl/run/run.h" #include #include diff --git a/src/app/fddev/dev1.c b/src/app/fddev/dev1.c index 3d5110bd72..65bba1329c 100644 --- a/src/app/fddev/dev1.c +++ b/src/app/fddev/dev1.c @@ -2,7 +2,7 @@ #include "fddev.h" #include "../fdctl/configure/configure.h" -#include "../fdctl/run.h" +#include "../fdctl/run/run.h" #include #include @@ -15,7 +15,6 @@ typedef enum { DEV1_VERIFY, DEV1_QUIC, DEV1_BANK, - DEV1_FORWARD, DEV1_SOLANA, } tile_t; @@ -31,7 +30,6 @@ dev1_cmd_args( int * pargc, else if( FD_LIKELY( !strcmp( *pargv[ 0 ], "verify" ) ) ) args->run1.tile = DEV1_VERIFY; else if( FD_LIKELY( !strcmp( *pargv[ 0 ], "quic" ) ) ) args->run1.tile = DEV1_QUIC; else if( FD_LIKELY( !strcmp( *pargv[ 0 ], "bank" ) ) ) args->run1.tile = DEV1_BANK; - else if( FD_LIKELY( !strcmp( *pargv[ 0 ], "forward" ) ) ) args->run1.tile = DEV1_FORWARD; else if( FD_LIKELY( !strcmp( *pargv[ 0 ], "labs" ) ) ) args->run1.tile = DEV1_SOLANA; else if( FD_LIKELY( !strcmp( *pargv[ 0 ], "solana" ) ) ) args->run1.tile = DEV1_SOLANA; else if( FD_LIKELY( !strcmp( *pargv[ 0 ], "solana-labs" ) ) ) args->run1.tile = DEV1_SOLANA; @@ -72,15 +70,17 @@ dev1_cmd_fn( args_t * args, }; switch( args->run1.tile ) { - case DEV1_PACK: tile_args.tile = &frank_pack; break; - case DEV1_DEDUP: tile_args.tile = &frank_dedup; break; - case DEV1_VERIFY: tile_args.tile = &frank_verify; break; - case DEV1_QUIC: tile_args.tile = &frank_quic; break; - case DEV1_FORWARD: tile_args.tile = &frank_forward; break; + case DEV1_PACK: tile_args.tile = &pack; break; + case DEV1_DEDUP: tile_args.tile = &dedup; break; + case DEV1_VERIFY: tile_args.tile = &verify; break; + case DEV1_QUIC: tile_args.tile = &quic; break; case DEV1_SOLANA: break; default: FD_LOG_ERR(( "unknown tile %d", args->run1.tile )); } + if( FD_UNLIKELY( close( 0 ) ) ) FD_LOG_ERR(( "close(0) failed (%i-%s)", 
errno, fd_io_strerror( errno ) )); + if( FD_UNLIKELY( close( 1 ) ) ) FD_LOG_ERR(( "close(1) failed (%i-%s)", errno, fd_io_strerror( errno ) )); + int result; if( args->run1.tile == DEV1_SOLANA ) result = solana_labs_main( config ); else result = tile_main( &tile_args ); diff --git a/src/app/frank/Local.mk b/src/app/frank/Local.mk deleted file mode 100644 index 942866fc96..0000000000 --- a/src/app/frank/Local.mk +++ /dev/null @@ -1,8 +0,0 @@ -ifdef FD_HAS_HOSTED -ifdef FD_HAS_ALLOCA -ifdef FD_HAS_X86 -$(call add-hdrs,fd_frank.h) -$(call add-objs,fd_frank_verify fd_frank_dedup fd_frank_quic fd_frank_pack fd_frank_forward,fd_frank) -endif -endif -endif diff --git a/src/app/frank/fd_frank.h b/src/app/frank/fd_frank.h deleted file mode 100644 index 6a5e89720a..0000000000 --- a/src/app/frank/fd_frank.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef HEADER_fd_src_app_frank_fd_frank_h -#define HEADER_fd_src_app_frank_fd_frank_h - -#include "../../disco/fd_disco.h" -#include "../../ballet/fd_ballet.h" /* FIXME: CONSIDER HAVING THIS IN DISCO_BASE */ -#include "../../tango/xdp/fd_xsk.h" - -/* FD_FRANK_CNC_DIAG_* are FD_CNC_DIAG_* style diagnostics and thus the - same considerations apply. Further they are harmonized with the - standard FD_CNC_DIAG_*. Specifically: - - IN_BACKP is same as standard IN_BACKP - - BACKP_CNT is same as standard BACKP_CNT - - {HA,SV}_FILT_{CNT,SZ} is frank specific and the number of times a - transaction was dropped by a verify tile due to failing signature - verification. */ - -#define FD_FRANK_CNC_DIAG_IN_BACKP FD_CNC_DIAG_IN_BACKP /* ==0 */ -#define FD_FRANK_CNC_DIAG_BACKP_CNT FD_CNC_DIAG_BACKP_CNT /* ==1 */ -#define FD_FRANK_CNC_DIAG_HA_FILT_CNT (2UL) /* updated by verify tile, frequently in ha situations, never o.w. 
*/ -#define FD_FRANK_CNC_DIAG_HA_FILT_SZ (3UL) /* " */ -#define FD_FRANK_CNC_DIAG_SV_FILT_CNT (4UL) /* ", ideally never */ -#define FD_FRANK_CNC_DIAG_SV_FILT_SZ (5UL) /* " */ - -#define FD_FRANK_CNC_DIAG_PID (128UL) - -typedef struct { - int pid; - char * app_name; - char * tile_name; - ulong tile_idx; - ulong idx; - uchar const * tile_pod; - uchar const * in_pod; - uchar const * out_pod; - uchar const * extra_pod; - fd_xsk_t * xsk; - fd_xsk_t * lo_xsk; - double tick_per_ns; -} fd_frank_args_t; - -typedef struct { - char * name; - char * in_wksp; - char * out_wksp; - char * extra_wksp; - ushort allow_syscalls_sz; - long * allow_syscalls; - ulong (*allow_fds)( fd_frank_args_t * args, ulong out_fds_sz, int * out_fds ); - void (*init)( fd_frank_args_t * args ); - void (*run )( fd_frank_args_t * args ); -} fd_frank_task_t; - -extern fd_frank_task_t frank_verify; -extern fd_frank_task_t frank_dedup; -extern fd_frank_task_t frank_quic; -extern fd_frank_task_t frank_pack; -extern fd_frank_task_t frank_forward; - -#endif /* HEADER_fd_src_app_frank_fd_frank_h */ diff --git a/src/app/frank/fd_frank_dedup.c b/src/app/frank/fd_frank_dedup.c deleted file mode 100644 index 38be9d1877..0000000000 --- a/src/app/frank/fd_frank_dedup.c +++ /dev/null @@ -1,110 +0,0 @@ -#include "fd_frank.h" - -#include -#include - -static void -run( fd_frank_args_t * args ) { - FD_LOG_INFO(( "dedup init" )); - - FD_LOG_INFO(( "joining cnc" )); - fd_cnc_t * cnc = fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ); - if( FD_UNLIKELY( !cnc ) ) FD_LOG_ERR(( "fd_cnc_join failed" )); - if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_CNC_SIGNAL_BOOT ) ) FD_LOG_ERR(( "cnc not in boot state" )); - - ulong * cnc_diag = (ulong *)fd_cnc_app_laddr( cnc ); - cnc_diag[ FD_FRANK_CNC_DIAG_PID ] = (ulong)args->pid; - - ulong in_cnt = fd_pod_query_ulong( args->in_pod, "cnt", 0 ); - if( FD_UNLIKELY( !in_cnt ) ) FD_LOG_ERR(( "cnt is zero" )); - FD_LOG_INFO(( "%lu verify found", in_cnt )); - - /* Join the IPC objects 
needed this tile instance */ - - fd_frag_meta_t const ** in_mcache = (fd_frag_meta_t const **) - fd_alloca( alignof(fd_frag_meta_t const *), sizeof(fd_frag_meta_t const *)*in_cnt ); - if( FD_UNLIKELY( !in_mcache ) ) FD_LOG_ERR(( "fd_alloca failed" )); - - ulong ** in_fseq = (ulong **)fd_alloca( alignof(ulong *), sizeof(ulong *)*in_cnt ); - if( FD_UNLIKELY( !in_fseq ) ) FD_LOG_ERR(( "fd_alloca failed" )); - - ulong in_idx = 0UL; - for( ulong i=0; iin_pod, path ) ); - if( FD_UNLIKELY( !in_mcache[ in_idx ] ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - - snprintf( path, 32, "fseq%lu", i ); - FD_LOG_INFO(( "joining fseq%lu", i )); - in_fseq[ in_idx ] = fd_fseq_join( fd_wksp_pod_map( args->in_pod, path ) ); - if( FD_UNLIKELY( !in_fseq[ in_idx ] ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); - - in_idx++; - } - - FD_LOG_INFO(( "joining tcache" )); - fd_tcache_t * tcache = fd_tcache_join( fd_wksp_pod_map( args->tile_pod, "tcache" ) ); - if( FD_UNLIKELY( !tcache ) ) FD_LOG_ERR(( "fd_tcache_join failed" )); - - FD_LOG_INFO(( "joining mcache" )); - fd_frag_meta_t * mcache = fd_mcache_join( fd_wksp_pod_map( args->out_pod, "mcache" ) ); - if( FD_UNLIKELY( !mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - - FD_LOG_INFO(( "joining fseq" )); - ulong * out_fseq = fd_fseq_join( fd_wksp_pod_map( args->out_pod, "fseq" ) ); - if( FD_UNLIKELY( !out_fseq ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); - - /* Setup local objects used by this tile */ - - ulong cr_max = fd_pod_query_ulong( args->tile_pod, "cr_max", 0UL ); /* 0 <> pick reasonable default */ - long lazy = fd_pod_query_long ( args->tile_pod, "lazy", 0L ); /* <=0 <> pick reasonable default */ - FD_LOG_INFO(( "configuring flow control (cr_max %lu lazy %li)", cr_max, lazy )); - - uint seed = fd_pod_query_uint( args->tile_pod, "seed", (uint)fd_tile_id() ); /* use app tile_id as default */ - FD_LOG_INFO(( "creating rng (seed %u)", seed )); - fd_rng_t _rng[ 1 ]; - fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, seed, 0UL ) ); - if( 
FD_UNLIKELY( !rng ) ) FD_LOG_ERR(( "fd_rng_join failed" )); - - FD_LOG_INFO(( "creating scratch" )); - ulong footprint = fd_dedup_tile_scratch_footprint( in_cnt, 1UL ); - if( FD_UNLIKELY( !footprint ) ) FD_LOG_ERR(( "fd_dedup_tile_scratch_footprint failed" )); - void * scratch = fd_alloca( FD_DEDUP_TILE_SCRATCH_ALIGN, footprint ); - if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "fd_alloca failed" )); - - /* Start deduping */ - - FD_LOG_INFO(( "dedup run" )); - int err = fd_dedup_tile( cnc, in_cnt, in_mcache, in_fseq, tcache, mcache, 1UL, &out_fseq, cr_max, lazy, rng, scratch, args->tick_per_ns ); - if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_dedup_tile failed (%i)", err )); -} - -static long allow_syscalls[] = { - __NR_write, /* logging */ - __NR_fsync, /* logging, WARNING and above fsync immediately */ -}; - -static ulong -allow_fds( fd_frank_args_t * args, - ulong out_fds_sz, - int * out_fds ) { - (void)args; - if( FD_UNLIKELY( out_fds_sz < 2 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); - out_fds[ 0 ] = 2; /* stderr */ - out_fds[ 1 ] = 3; /* logfile */ - return 2; -} - -fd_frank_task_t frank_dedup = { - .name = "dedup", - .in_wksp = "verify_dedup", - .out_wksp = "dedup_pack", - .extra_wksp = NULL, - .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), - .allow_syscalls = allow_syscalls, - .allow_fds = allow_fds, - .init = NULL, - .run = run, -}; diff --git a/src/app/frank/fd_frank_forward.c b/src/app/frank/fd_frank_forward.c deleted file mode 100644 index 3cea6e9aee..0000000000 --- a/src/app/frank/fd_frank_forward.c +++ /dev/null @@ -1,158 +0,0 @@ -#include "fd_frank.h" -#include - -static void -run( fd_frank_args_t * args ) { - /* Join the IPC objects needed this tile instance */ - - FD_LOG_INFO(( "joining cnc" )); - fd_cnc_t * cnc = fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ); - if( FD_UNLIKELY( !cnc ) ) FD_LOG_ERR(( "fd_cnc_join failed" )); - if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_CNC_SIGNAL_BOOT ) ) FD_LOG_ERR(( "cnc 
not in boot state" )); - - ulong * cnc_diag = (ulong *)fd_cnc_app_laddr( cnc ); - cnc_diag[ FD_FRANK_CNC_DIAG_PID ] = (ulong)args->pid; - - FD_LOG_INFO(( "joining mcache" )); - fd_frag_meta_t const * mcache = fd_mcache_join( fd_wksp_pod_map( args->in_pod, "mcache" ) ); - if( FD_UNLIKELY( !mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - ulong depth = fd_mcache_depth( mcache ); - ulong const * sync = fd_mcache_seq_laddr_const( mcache ); - ulong seq = fd_mcache_seq_query( sync ); - - fd_frag_meta_t const * mline = mcache + fd_mcache_line_idx( seq, depth ); - - FD_LOG_INFO(( "joining dcache" )); - uchar * dcache = fd_dcache_join( fd_wksp_pod_map( args->in_pod, "dcache" ) ); - fd_wksp_t * wksp = fd_wksp_containing( dcache ); - if( FD_UNLIKELY( !wksp ) ) FD_LOG_ERR(( "fd_wksp_containing failed" )); - - FD_LOG_INFO(( "joining fseq" )); - ulong * fseq = fd_fseq_join( fd_wksp_pod_map( args->in_pod, "fseq" ) ); - if( FD_UNLIKELY( !fseq ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); - /* Hook up to this pack's flow control diagnostics (will be stored in - the pack's fseq) */ - ulong * fseq_diag = (ulong *)fd_fseq_app_laddr( fseq ); - if( FD_UNLIKELY( !fseq_diag ) ) FD_LOG_ERR(( "fd_cnc_app_laddr failed" )); - FD_COMPILER_MFENCE(); - fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ] = 0UL; - fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ] = 0UL; - fseq_diag[ FD_FSEQ_DIAG_OVRNP_CNT ] = 0UL; - fseq_diag[ FD_FSEQ_DIAG_OVRNR_CNT ] = 0UL; - FD_COMPILER_MFENCE(); - ulong accum_pub_cnt = 0UL; - ulong accum_pub_sz = 0UL; - ulong accum_ovrnp_cnt = 0UL; - ulong accum_ovrnr_cnt = 0UL; - - /* Setup local objects used by this tile */ - - long lazy = fd_pod_query_long( args->tile_pod, "lazy", 0L ); - FD_LOG_INFO(( "configuring flow control (lazy %li)", lazy )); - if( lazy<=0L ) lazy = fd_tempo_lazy_default( depth ); - FD_LOG_INFO(( "using lazy %li ns", lazy )); - ulong async_min = fd_tempo_async_min( lazy, 1UL /*event_cnt*/, (float)args->tick_per_ns ); - if( FD_UNLIKELY( !async_min ) ) FD_LOG_ERR(( "bad lazy" )); - - 
uint seed = (uint)fd_tile_id(); /* TODO: LML is this a good seed? */ - FD_LOG_INFO(( "creating rng (seed %u)", seed )); - fd_rng_t _rng[ 1 ]; - fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, seed, 0UL ) ); - if( FD_UNLIKELY( !rng ) ) FD_LOG_ERR(( "fd_rng_join failed" )); - - FD_LOG_INFO(( "forward run" )); - fd_cnc_signal( cnc, FD_CNC_SIGNAL_RUN ); - - long now = fd_tickcount(); - long then = now; /* Do housekeeping on first iteration of run loop */ - for(;;) { - - /* Do housekeeping at a low rate in the background */ - - if( FD_UNLIKELY( (now-then)>=0L ) ) { - - /* Send flow control credits */ - fd_fctl_rx_cr_return( fseq, seq ); - - /* Send diagnostic info */ - fd_cnc_heartbeat( cnc, now ); - FD_COMPILER_MFENCE(); - fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ] += accum_pub_cnt; - fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ] += accum_pub_sz; - fseq_diag[ FD_FSEQ_DIAG_OVRNP_CNT ] += accum_ovrnp_cnt; - fseq_diag[ FD_FSEQ_DIAG_OVRNR_CNT ] += accum_ovrnr_cnt; - FD_COMPILER_MFENCE(); - accum_pub_cnt += 1UL; - accum_pub_sz += 1UL; - accum_ovrnp_cnt += 1UL; - accum_ovrnr_cnt += 1UL; - - /* Receive command-and-control signals */ - ulong s = fd_cnc_signal_query( cnc ); - if( FD_UNLIKELY( s!=FD_CNC_SIGNAL_RUN ) ) { - if( FD_UNLIKELY( s!=FD_CNC_SIGNAL_HALT ) ) FD_LOG_ERR(( "Unexpected signal" )); - break; - } - - /* Reload housekeeping timer */ - then = now + (long)fd_tempo_async_reload( rng, async_min ); - } - - /* See if there are any transactions waiting to be forwarded */ - ulong seq_found = fd_frag_meta_seq_query( mline ); - long diff = fd_seq_diff( seq_found, seq ); - if( FD_UNLIKELY( diff ) ) { /* caught up or overrun, optimize for expected sequence number ready */ - if( FD_LIKELY( diff<0L ) ) { /* caught up */ - FD_SPIN_PAUSE(); - now = fd_tickcount(); - continue; - } - /* overrun by pack tile ... 
recover */ - accum_ovrnp_cnt++; - seq = seq_found; - /* can keep processing from the new seq */ - } - - now = fd_tickcount(); - - /* Check that we weren't overrun while processing */ - seq_found = fd_frag_meta_seq_query( mline ); - if( FD_UNLIKELY( fd_seq_ne( seq_found, seq ) ) ) { - accum_ovrnr_cnt++; - seq = seq_found; - continue; - } - - /* Wind up for the next iteration */ - seq = fd_seq_inc( seq, 1UL ); - mline = mcache + fd_mcache_line_idx( seq, depth ); - } -} - -static long allow_syscalls[] = { - __NR_write, /* logging */ - __NR_fsync, /* logging, WARNING and above fsync immediately */ -}; - -static ulong -allow_fds( fd_frank_args_t * args, - ulong out_fds_sz, - int * out_fds ) { - (void)args; - if( FD_UNLIKELY( out_fds_sz < 2 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); - out_fds[ 0 ] = 2; /* stderr */ - out_fds[ 1 ] = 3; /* logfile */ - return 2; -} - -fd_frank_task_t frank_forward = { - .name = "forward", - .in_wksp = "pack_forward", - .out_wksp = NULL, - .extra_wksp = NULL, - .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), - .allow_syscalls = allow_syscalls, - .allow_fds = allow_fds, - .init = NULL, - .run = run, -}; diff --git a/src/app/frank/fd_frank_pack.c b/src/app/frank/fd_frank_pack.c deleted file mode 100644 index ab56bfb90c..0000000000 --- a/src/app/frank/fd_frank_pack.c +++ /dev/null @@ -1,372 +0,0 @@ -#include "fd_frank.h" - -#include "../../ballet/pack/fd_pack.h" - -#include -#include - -#define FD_PACK_TAG 0x17ac1C711eUL - -#define MAX_MICROBLOCK_SZ USHORT_MAX /* in bytes. 
Defined this way to - use the size field of mcache */ - -/* Helper struct containing all the state associated with one output */ -typedef struct { - fd_wksp_t * out_wksp; - fd_frag_meta_t * out_mcache; - uchar * out_dcache; - ulong * out_fseq; - ulong * out_sync; - ulong out_seq; - ulong out_chunk0; - ulong out_wmark; - ulong out_chunk; - ulong out_cr_avail; - ulong out_depth; - fd_fctl_t * out_fctl; - uchar _fctl_footprint[ FD_FCTL_FOOTPRINT( 1 ) ] __attribute__((aligned(FD_FCTL_ALIGN))); -} out_state; - - -#define FD_FRANK_PACK_MAX_OUT (16UL) /* About 1.5 kB on the stack */ - -static void -join_out( out_state * state, - uchar const * pod, - ulong suffix ) { - char path[ 32 ]; - - FD_LOG_INFO(( "joining mcache%lu", suffix )); - snprintf( path, sizeof( path ), "mcache%lu", suffix ); - fd_frag_meta_t * out_mcache = fd_mcache_join( fd_wksp_pod_map( pod, path ) ); - if( FD_UNLIKELY( !out_mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - - FD_LOG_INFO(( "joining dcache%lu", suffix )); - snprintf( path, sizeof( path ), "dcache%lu", suffix ); - uchar * out_dcache = fd_dcache_join( fd_wksp_pod_map( pod, path ) ); - if( FD_UNLIKELY( !out_dcache ) ) FD_LOG_ERR(( "fd_dcache_join failed" )); - - FD_LOG_INFO(( "joining fseq%lu", suffix )); - snprintf( path, sizeof( path ), "fseq%lu", suffix ); - ulong * out_fseq = fd_fseq_join( fd_wksp_pod_map( pod, path) ); - if( FD_UNLIKELY( !out_fseq ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); - - ulong * fseq_diag = (ulong *)fd_fseq_app_laddr( out_fseq ); - if( FD_UNLIKELY( !fseq_diag ) ) FD_LOG_ERR(( "fd_cnc_app_laddr failed" )); - - fd_wksp_t * wksp = fd_wksp_containing( out_dcache ); - - ulong * out_sync = fd_mcache_seq_laddr( out_mcache ); - ulong out_seq = fd_mcache_seq_query( out_sync ); - ulong out_chunk0 = fd_dcache_compact_chunk0( wksp, out_dcache ); - ulong out_wmark = fd_dcache_compact_wmark ( wksp, out_dcache, MAX_MICROBLOCK_SZ ); - ulong out_chunk = out_chunk0; - ulong out_cr_avail = 0UL; - ulong out_depth = 
fd_mcache_depth( out_mcache ); - - fd_fctl_t * out_fctl = fd_fctl_cfg_done( fd_fctl_cfg_rx_add( - fd_fctl_join( fd_fctl_new( state->_fctl_footprint, 1UL ) ), - out_depth, out_fseq, &fseq_diag[ FD_FSEQ_DIAG_SLOW_CNT ] ), - 1UL /*cr_burst*/, 0UL, 0UL, 0UL ); /* TODO: allow manual configuration of these? */ - - FD_LOG_INFO(( "using cr_burst %lu, cr_max %lu, cr_resume %lu, cr_refill %lu", - fd_fctl_cr_burst( out_fctl ), fd_fctl_cr_max( out_fctl ), fd_fctl_cr_resume( out_fctl ), fd_fctl_cr_refill( out_fctl ) )); - - state->out_wksp = wksp; - state->out_mcache = out_mcache; - state->out_dcache = out_dcache; - state->out_fseq = out_fseq; - state->out_sync = out_sync; - state->out_seq = out_seq; - state->out_chunk0 = out_chunk0; - state->out_wmark = out_wmark; - state->out_chunk = out_chunk; - state->out_cr_avail = out_cr_avail; - state->out_depth = out_depth; - state->out_fctl = out_fctl; -} - - -static void -run( fd_frank_args_t * args ) { - /* Join the IPC objects needed this tile instance */ - - FD_LOG_INFO(( "joining cnc" )); - fd_cnc_t * cnc = fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ); - if( FD_UNLIKELY( !cnc ) ) FD_LOG_ERR(( "fd_cnc_join failed" )); - if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_CNC_SIGNAL_BOOT ) ) FD_LOG_ERR(( "cnc not in boot state" )); - - ulong * cnc_diag = (ulong *)fd_cnc_app_laddr( cnc ); - cnc_diag[ FD_FRANK_CNC_DIAG_PID ] = (ulong)args->pid; - - FD_LOG_INFO(( "joining mcache" )); - fd_frag_meta_t const * mcache = fd_mcache_join( fd_wksp_pod_map( args->in_pod, "mcache" ) ); - if( FD_UNLIKELY( !mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - ulong depth = fd_mcache_depth( mcache ); - ulong const * sync = fd_mcache_seq_laddr_const( mcache ); - ulong seq = fd_mcache_seq_query( sync ); - - fd_frag_meta_t const * mline = mcache + fd_mcache_line_idx( seq, depth ); - - FD_LOG_INFO(( "joining dcache%lu", args->tile_idx )); - /* Note (chunks are referenced relative to the containing workspace - currently and there is just one 
workspace). */ - uchar * dcache = fd_dcache_join( fd_wksp_pod_map( args->extra_pod, "dcache0" ) ); - fd_wksp_t * wksp = fd_wksp_containing( dcache ); - if( FD_UNLIKELY( !wksp ) ) FD_LOG_ERR(( "fd_wksp_containing failed" )); - - FD_LOG_INFO(( "joining fseq" )); - ulong * fseq = fd_fseq_join( fd_wksp_pod_map( args->in_pod, "fseq" ) ); - if( FD_UNLIKELY( !fseq ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); - /* Hook up to this pack's flow control diagnostics (will be stored in - the pack's fseq) */ - ulong * fseq_diag = (ulong *)fd_fseq_app_laddr( fseq ); - if( FD_UNLIKELY( !fseq_diag ) ) FD_LOG_ERR(( "fd_cnc_app_laddr failed" )); - FD_COMPILER_MFENCE(); - fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ] = 0UL; - fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ] = 0UL; - fseq_diag[ FD_FSEQ_DIAG_OVRNP_CNT ] = 0UL; - fseq_diag[ FD_FSEQ_DIAG_OVRNR_CNT ] = 0UL; - FD_COMPILER_MFENCE(); - ulong accum_pub_cnt = 0UL; - ulong accum_pub_sz = 0UL; - ulong accum_ovrnp_cnt = 0UL; - ulong accum_ovrnr_cnt = 0UL; - - - ulong pack_depth = fd_pod_query_ulong( args->tile_pod, "depth", 0UL ); - if( FD_UNLIKELY( !pack_depth ) ) FD_LOG_ERR(( "pack.depth unset or set to zero" )); - - /* Should these be allocated with alloca instead? 
*/ - out_state out[ FD_FRANK_PACK_MAX_OUT ]; - - /* FIXME: Plumb this through properly: */ - ulong bank_cnt = fd_pod_query_ulong( args->out_pod, "num_tiles", 0UL ); - if( FD_UNLIKELY( !bank_cnt ) ) FD_LOG_ERR(( "pack.num_tiles unset or set to zero" )); - if( FD_UNLIKELY( bank_cnt>FD_FRANK_PACK_MAX_OUT ) ) FD_LOG_ERR(( "pack tile connects to too many banking tiles" )); - - for( ulong i=0UL; iout_pod, i ); - - ulong max_txn_per_microblock = MAX_MICROBLOCK_SZ/sizeof(fd_txn_p_t); - - ulong pack_footprint = fd_pack_footprint( pack_depth, bank_cnt, max_txn_per_microblock ); - - ulong cus_per_microblock = 1500000UL; /* 1.5 M cost units, enough for 1 max size transaction */ - float vote_fraction = 0.75; - - /* Setup local objects used by this tile */ - - long lazy = fd_pod_query_long( args->tile_pod, "lazy", 0L ); - FD_LOG_INFO(( "configuring flow control (lazy %li)", lazy )); - if( lazy<=0L ) lazy = fd_tempo_lazy_default( depth ); - FD_LOG_INFO(( "using lazy %li ns", lazy )); - ulong async_min = fd_tempo_async_min( lazy, 1UL /*event_cnt*/, (float)args->tick_per_ns ); - if( FD_UNLIKELY( !async_min ) ) FD_LOG_ERR(( "bad lazy" )); - - uint seed = fd_pod_query_uint( args->tile_pod, "seed", (uint)fd_tile_id() ); /* use app tile_id as default */ - FD_LOG_INFO(( "creating rng (seed %u)", seed )); - fd_rng_t _rng[ 1 ]; - fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, seed, 0UL ) ); - if( FD_UNLIKELY( !rng ) ) FD_LOG_ERR(( "fd_rng_join failed" )); - - void * pack_laddr = fd_wksp_alloc_laddr( fd_wksp_containing( args->tile_pod ), fd_pack_align(), pack_footprint, FD_PACK_TAG ); - if( FD_UNLIKELY( !pack_laddr ) ) FD_LOG_ERR(( "allocating memory for pack object failed" )); - - - fd_pack_t * pack = fd_pack_join( fd_pack_new( pack_laddr, pack_depth, bank_cnt, max_txn_per_microblock, rng ) ); - - - FD_LOG_INFO(( "packing blocks of at most %lu transactions to %lu bank tiles", max_txn_per_microblock, bank_cnt )); - - const ulong block_duration_ns = 400UL*1000UL*1000UL; /* 400ms */ - - 
long block_duration_ticks = (long)(args->tick_per_ns * (double)block_duration_ns); - - int ctl_som = 1; - int ctl_eom = 1; - int ctl_err = 0; - ulong ctl = fd_frag_meta_ctl( args->tile_idx, ctl_som, ctl_eom, ctl_err ); - /* Start packing */ - - - FD_LOG_INFO(( "pack run" )); - fd_cnc_signal( cnc, FD_CNC_SIGNAL_RUN ); - - long now = fd_tickcount(); - long then = now; /* Do housekeeping on first iteration of run loop */ - long block_end = now + block_duration_ticks; - for(;;) { - - /* Do housekeeping at a low rate in the background */ - - if( FD_UNLIKELY( (now-then)>=0L ) ) { - - /* Send flow control credits */ - fd_fctl_rx_cr_return( fseq, seq ); - - /* TODO: It's not clear what the best way to do this is. Should - we update them all in one housekeeping loop, or do like other - parts of the code and update a random one each housekeeping - loop? */ - for( ulong i=0UL; iout_sync, o->out_seq ); - - /* Receive flow control credits */ - o->out_cr_avail = fd_fctl_tx_cr_update( o->out_fctl, o->out_cr_avail, o->out_seq ); - } - - /* Send diagnostic info */ - fd_cnc_heartbeat( cnc, now ); - FD_COMPILER_MFENCE(); - fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ] += accum_pub_cnt; - fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ] += accum_pub_sz; - fseq_diag[ FD_FSEQ_DIAG_OVRNP_CNT ] += accum_ovrnp_cnt; - fseq_diag[ FD_FSEQ_DIAG_OVRNR_CNT ] += accum_ovrnr_cnt; - FD_COMPILER_MFENCE(); - accum_pub_cnt = 0UL; - accum_pub_sz = 0UL; - accum_ovrnp_cnt = 0UL; - accum_ovrnr_cnt = 0UL; - - /* Receive command-and-control signals */ - ulong s = fd_cnc_signal_query( cnc ); - if( FD_UNLIKELY( s!=FD_CNC_SIGNAL_RUN ) ) { - if( FD_UNLIKELY( s!=FD_CNC_SIGNAL_HALT ) ) FD_LOG_ERR(( "Unexpected signal" )); - break; - } - - /* Reload housekeeping timer */ - then = now + (long)fd_tempo_async_reload( rng, async_min ); - } - - /* Are we ready to end the block? */ - if( FD_UNLIKELY( (now-block_end)>=0L ) ) { - fd_pack_end_block( pack ); - block_end += block_duration_ticks; - } - - /* Is it time to schedule the next microblock? 
*/ - /* for each banking thread, if it has credits */ - for( ulong i=0UL; iout_cr_avail>0UL ) ) { /* optimize for the case we send a microblock */ - void * microblock_dst = fd_chunk_to_laddr( o->out_wksp, o->out_chunk ); - fd_pack_microblock_complete( pack, i ); - ulong schedule_cnt = fd_pack_schedule_next_microblock( pack, cus_per_microblock, vote_fraction, i, microblock_dst ); - if( FD_LIKELY( schedule_cnt ) ) { - ulong tspub = (ulong)fd_frag_meta_ts_comp( fd_tickcount() ); - ulong chunk = o->out_chunk; - ulong sig = 0UL; - ulong msg_sz = schedule_cnt*sizeof(fd_txn_p_t); - - fd_mcache_publish( o->out_mcache, o->out_depth, o->out_seq, sig, chunk, msg_sz, ctl, 0UL, tspub ); - - o->out_chunk = fd_dcache_compact_next( o->out_chunk, msg_sz, o->out_chunk0, o->out_wmark ); - o->out_seq = fd_seq_inc( o->out_seq, 1UL ); - o->out_cr_avail--; - } - } - } - /* Normally, we have an "else, do housekeeping next iteration" - branch here, but because we're using extremely short queues, we - actually expect to spend a significant fraction of the time in - the "no transmit credits available" state. */ - - - /* See if there are any transactions waiting to be packed */ - ulong seq_found = fd_frag_meta_seq_query( mline ); - long diff = fd_seq_diff( seq_found, seq ); - if( FD_UNLIKELY( diff ) ) { /* caught up or overrun, optimize for expected sequence number ready */ - if( FD_LIKELY( diff<0L ) ) { /* caught up */ - FD_SPIN_PAUSE(); - now = fd_tickcount(); - continue; - } - /* overrun by dedup tile ... recover */ - accum_ovrnp_cnt++; - seq = seq_found; - /* can keep processing from the new seq */ - } - - now = fd_tickcount(); - - /* At this point, we have started receiving frag seq with details in - mline at time now. Speculatively processs it here. 
*/ - - /* Speculative pack operations */ - fd_txn_p_t * slot = fd_pack_insert_txn_init( pack ); - - ulong sz = (ulong)mline->sz; - uchar const * dcache_entry = fd_chunk_to_laddr_const( wksp, mline->chunk ); - ulong mline_sig = mline->sig; - /* Assume that the dcache entry is: - Payload ....... (payload_sz bytes) - 0 or 1 byte of padding (since alignof(fd_txn) is 2) - fd_txn ....... (size computed by fd_txn_footprint) - payload_sz (2B) - mline->sz includes all three fields and the padding */ - ulong payload_sz = *(ushort*)(dcache_entry + sz - sizeof(ushort)); - uchar const * payload = dcache_entry; - fd_txn_t const * txn = (fd_txn_t const *)( dcache_entry + fd_ulong_align_up( payload_sz, 2UL ) ); - fd_memcpy( slot->payload, payload, payload_sz ); - fd_memcpy( TXN(slot), txn, fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) ); - slot->payload_sz = payload_sz; - slot->meta = mline_sig; - -#if DETAILED_LOGGING - FD_LOG_NOTICE(( "Pack got a packet. Payload size: %lu, txn footprint: %lu", payload_sz, - fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) - )); -#endif - - /* Check that we weren't overrun while processing */ - seq_found = fd_frag_meta_seq_query( mline ); - if( FD_UNLIKELY( fd_seq_ne( seq_found, seq ) ) ) { - fd_pack_insert_txn_cancel( pack, slot ); - accum_ovrnr_cnt++; - seq = seq_found; - continue; - } - - /* Non-speculative pack operations */ - accum_pub_cnt++; - accum_pub_sz += sz; - - fd_pack_insert_txn_fini( pack, slot ); - - /* Wind up for the next iteration */ - seq = fd_seq_inc( seq, 1UL ); - mline = mcache + fd_mcache_line_idx( seq, depth ); - } -} - -static long allow_syscalls[] = { - __NR_write, /* logging */ - __NR_fsync, /* logging, WARNING and above fsync immediately */ -}; - -static ulong -allow_fds( fd_frank_args_t * args, - ulong out_fds_sz, - int * out_fds ) { - (void)args; - if( FD_UNLIKELY( out_fds_sz < 2 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); - out_fds[ 0 ] = 2; /* stderr */ - out_fds[ 1 ] = 3; /* 
logfile */ - return 2; -} - -fd_frank_task_t frank_pack = { - .name = "pack", - .in_wksp = "dedup_pack", - .out_wksp = "pack_bank", - .extra_wksp = "tpu_txn_data", - .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), - .allow_syscalls = allow_syscalls, - .allow_fds = allow_fds, - .init = NULL, - .run = run, -}; diff --git a/src/app/frank/fd_frank_quic.c b/src/app/frank/fd_frank_quic.c deleted file mode 100644 index fb863dc847..0000000000 --- a/src/app/frank/fd_frank_quic.c +++ /dev/null @@ -1,274 +0,0 @@ -#include "fd_frank.h" - -#include "../../disco/quic/fd_quic.h" -#include "../../tango/xdp/fd_xdp.h" -#include "../../tango/xdp/fd_xsk_private.h" -#include "../../util/net/fd_eth.h" -#include "../../util/net/fd_ip4.h" - -#include - -#include -#include - -#include -#include - -static void -init( fd_frank_args_t * args ) { - FD_LOG_INFO(( "loading %s", "xsk" )); - args->xsk = fd_xsk_join( fd_wksp_pod_map( args->tile_pod, "xsk" ) ); - if( FD_UNLIKELY( !args->xsk ) ) FD_LOG_ERR(( "fd_xsk_join failed" )); - - args->lo_xsk = NULL; - if( FD_UNLIKELY( fd_pod_query_cstr( args->tile_pod, "lo_xsk", NULL ) ) ) { - FD_LOG_INFO(( "loading %s", "lo_xsk" )); - args->lo_xsk = fd_xsk_join( fd_wksp_pod_map( args->tile_pod, "lo_xsk" ) ); - if( FD_UNLIKELY( !args->lo_xsk ) ) FD_LOG_ERR(( "fd_xsk_join (lo) failed" )); - } - - /* call wallclock so glibc loads VDSO, which requires calling mmap while - privileged */ - fd_log_wallclock(); - - /* OpenSSL goes and tries to read files and allocate memory and - other dumb things on a thread local basis, so we need a special - initializer to do it before seccomp happens in the process. 
*/ - ERR_STATE * state = ERR_get_state(); - if( FD_UNLIKELY( !state )) FD_LOG_ERR(( "ERR_get_state failed" )); - if( FD_UNLIKELY( !OPENSSL_init_ssl( OPENSSL_INIT_LOAD_SSL_STRINGS , NULL ) ) ) - FD_LOG_ERR(( "OPENSSL_init_ssl failed" )); - if( FD_UNLIKELY( !OPENSSL_init_crypto( OPENSSL_INIT_LOAD_CRYPTO_STRINGS | OPENSSL_INIT_NO_LOAD_CONFIG , NULL ) ) ) - FD_LOG_ERR(( "OPENSSL_init_crypto failed" )); -} - -struct fd_quic_tpu_ctx; - -struct root_aio_ctx { - ushort transaction_listen_port; - ushort quic_transaction_listen_port; - - const fd_aio_t * quic_aio; - void (*transaction_callback)( struct fd_quic_tpu_ctx * ctx, uchar const * packet, uint packet_sz ); -}; - -static int -root_aio_net_rx( void * ctx, - fd_aio_pkt_info_t const * batch, - ulong batch_cnt, - ulong * opt_batch_idx, - int flush ) { - struct root_aio_ctx * root_ctx = ctx; - - for( ulong i=0; i packet_end ) ) continue; - - /* Extract IP dest addr and UDP dest port */ - ulong ip_dstaddr = *(uint *)( iphdr+16UL ); - (void)ip_dstaddr; - ushort udp_dstport = *(ushort *)( udp+2UL ); - - uchar const * data = udp + 8U; - uint data_sz = (uint)(packet_end - data); - - ulong ignored; - if( FD_LIKELY( fd_ushort_bswap( udp_dstport ) == root_ctx->quic_transaction_listen_port ) ) - root_ctx->quic_aio->send_func( root_ctx->quic_aio->ctx, batch + i, 1, &ignored, flush ); - else if( FD_LIKELY( fd_ushort_bswap( udp_dstport ) == root_ctx->transaction_listen_port ) ) - root_ctx->transaction_callback( root_ctx->quic_aio->ctx, data, data_sz ); - else - FD_LOG_ERR(( "Firedancer received a UDP packet on port %hu which was not expected. " - "Only ports %hu and %hu should be configured to forward packets. 
Do " - "you need to reload the XDP program?", - fd_ushort_bswap( udp_dstport ), root_ctx->transaction_listen_port, root_ctx->quic_transaction_listen_port )); - } - - /* the assumption here at present is that any packet that could not be processed - is simply dropped hence, all packets were consumed */ - if( FD_LIKELY( opt_batch_idx ) ) { - *opt_batch_idx = batch_cnt; - } - - return FD_AIO_SUCCESS; -} - -extern void -fd_quic_transaction_receive( struct fd_quic_tpu_ctx * ctx, - uchar const * packet, - uint packet_sz ); - -static void -run( fd_frank_args_t * args ) { - FD_LOG_INFO(( "quic.%lu init", args->idx )); - - /* Join the IPC objects needed by this tile instance */ - - FD_LOG_INFO(( "joining cnc" )); - fd_cnc_t * cnc = fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ); - if( FD_UNLIKELY( !cnc ) ) FD_LOG_ERR(( "fd_cnc_join failed" )); - if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_CNC_SIGNAL_BOOT ) ) FD_LOG_ERR(( "cnc not in boot state" )); - - ulong * cnc_diag = (ulong *)fd_cnc_app_laddr( cnc ); - cnc_diag[ FD_FRANK_CNC_DIAG_PID ] = (ulong)args->pid; - - FD_LOG_INFO(( "joining mcache%lu", args->tile_idx )); - char path[ 32 ]; - snprintf( path, sizeof(path), "mcache%lu", args->tile_idx ); - fd_frag_meta_t * mcache = fd_mcache_join( fd_wksp_pod_map( args->out_pod, path ) ); - if( FD_UNLIKELY( !mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - - FD_LOG_INFO(( "joining dcache" )); - snprintf( path, sizeof(path), "dcache%lu", args->tile_idx ); - uchar * dcache = fd_dcache_join( fd_wksp_pod_map( args->extra_pod, path ) ); - if( FD_UNLIKELY( !dcache ) ) FD_LOG_ERR(( "fd_dcache_join failed" )); - - FD_LOG_INFO(( "loading quic" )); - fd_quic_t * quic = fd_quic_join( fd_wksp_pod_map( args->tile_pod, "quic" ) ); - if( FD_UNLIKELY( !quic ) ) FD_LOG_ERR(( "fd_quic_join failed" )); - - FD_LOG_INFO(( "loading xsk_aio" )); - fd_xsk_aio_t * xsk_aio = fd_xsk_aio_join( fd_wksp_pod_map( args->tile_pod, "xsk_aio" ), args->xsk ); - if( FD_UNLIKELY( !xsk_aio ) ) 
FD_LOG_ERR(( "fd_xsk_aio_join failed" )); - - fd_xsk_aio_t * lo_xsk_aio = NULL; - if( FD_UNLIKELY( args->lo_xsk ) ) { - FD_LOG_INFO(( "loading lo xsk_aio" )); - lo_xsk_aio = fd_xsk_aio_join( fd_wksp_pod_map( args->tile_pod, "lo_xsk_aio" ), args->lo_xsk ); - if( FD_UNLIKELY( !lo_xsk_aio ) ) FD_LOG_ERR(( "fd_xsk_aio_join failed" )); - } - - /* Setup local objects used by this tile */ - - FD_LOG_INFO(( "configuring flow control" )); - ulong cr_max = fd_pod_query_ulong( args->tile_pod, "cr_max", 0UL ); - ulong cr_resume = fd_pod_query_ulong( args->tile_pod, "cr_resume", 0UL ); - ulong cr_refill = fd_pod_query_ulong( args->tile_pod, "cr_refill", 0UL ); - long lazy = fd_pod_query_long ( args->tile_pod, "lazy", 0L ); - FD_LOG_INFO(( "cr_max %lu", cr_max )); - FD_LOG_INFO(( "cr_resume %lu", cr_resume )); - FD_LOG_INFO(( "cr_refill %lu", cr_refill )); - FD_LOG_INFO(( "lazy %li", lazy )); - - uint seed = fd_pod_query_uint( args->tile_pod, "dedup.seed", (uint)fd_tile_id() ); /* use app tile_id as default */ - FD_LOG_INFO(( "creating rng (seed %u)", seed )); - fd_rng_t _rng[ 1 ]; - fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, seed, 0UL ) ); - if( FD_UNLIKELY( !rng ) ) FD_LOG_ERR(( "fd_rng_join failed" )); - - FD_LOG_INFO(( "creating scratch" )); - ulong footprint = fd_quic_tile_scratch_footprint( fd_mcache_depth( mcache ) ); - if( FD_UNLIKELY( !footprint ) ) FD_LOG_ERR(( "fd_quic_tile_scratch_footprint failed" )); - void * scratch = fd_alloca( FD_QUIC_TILE_SCRATCH_ALIGN, footprint ); - if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "fd_alloca failed" )); - - /* Configure QUIC server */ - - fd_quic_config_t * quic_cfg = &quic->config; - quic_cfg->role = FD_QUIC_ROLE_SERVER; - - char const * keylog_file = fd_pod_query_cstr( args->tile_pod, "keylog_file", NULL ); /* optional */ - - strncpy( quic_cfg->keylog_file, keylog_file ? keylog_file : "", FD_QUIC_CERT_PATH_LEN ); - - /* TODO read IP addresses from interface instead? 
*/ - quic_cfg->net.ip_addr = fd_pod_query_uint( args->tile_pod, "ip_addr", 0 ); - if( FD_UNLIKELY( !quic_cfg->net.ip_addr ) ) FD_LOG_ERR(( "ip_addr not set" )); - - /* TODO read MAC address from interface instead? */ - const void * src_mac = fd_pod_query_buf( args->tile_pod, "src_mac_addr", NULL ); - if( FD_UNLIKELY( !src_mac ) ) FD_LOG_ERR(( "src_mac_addr not set" )); - fd_memcpy( quic_cfg->link.src_mac_addr, src_mac, 6 ); - - ushort transaction_listen_port = fd_pod_query_ushort( args->tile_pod, "transaction_listen_port", 0 ); - if( FD_UNLIKELY( !transaction_listen_port ) ) FD_LOG_ERR(( "transaction_listen_port not set" )); - - ushort quic_transaction_listen_port = fd_pod_query_ushort( args->tile_pod, "quic_transaction_listen_port", 0 ); - if( FD_UNLIKELY( !quic_transaction_listen_port ) ) FD_LOG_ERR(( "quic_transaction_listen_port not set" )); - quic_cfg->net.listen_udp_port = quic_transaction_listen_port; - - ulong idle_timeout_ms = fd_pod_query_ulong( args->tile_pod, "idle_timeout_ms", 0 ); - if( FD_UNLIKELY( !idle_timeout_ms ) ) FD_LOG_ERR(( "idle_timeout_ms not set" )); - quic_cfg->idle_timeout = idle_timeout_ms * 1000000UL; - - ulong initial_rx_max_stream_data = fd_pod_query_ulong( args->tile_pod, "initial_rx_max_stream_data", 1<<15 ); - if( FD_UNLIKELY( !initial_rx_max_stream_data ) ) FD_LOG_ERR(( "initial_rx_max_stream_data not set" )); - quic_cfg->initial_rx_max_stream_data = initial_rx_max_stream_data; - - /* Attach to XSK */ - - const fd_aio_t * quic_aio = fd_quic_get_aio_net_rx( quic ); - - struct root_aio_ctx root_ctx = { - .quic_aio = quic_aio, - .transaction_callback = fd_quic_transaction_receive, - .transaction_listen_port = transaction_listen_port, - .quic_transaction_listen_port = quic_transaction_listen_port, - }; - - fd_aio_t root_aio = { - .ctx = &root_ctx, - .send_func = root_aio_net_rx, - }; - - if( FD_UNLIKELY( lo_xsk_aio) ) fd_xsk_aio_set_rx( lo_xsk_aio, &root_aio ); - fd_xsk_aio_set_rx ( xsk_aio, &root_aio ); - fd_quic_set_aio_net_tx( 
quic, fd_xsk_aio_get_tx( xsk_aio ) ); - - /* Start serving */ - - FD_LOG_INFO(( "%s(%lu) run", args->tile_name, args->tile_idx )); - int err = fd_quic_tile( cnc, quic, xsk_aio, lo_xsk_aio, mcache, dcache, lazy, rng, scratch, args->tick_per_ns ); - if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_quic_tile failed (%i)", err )); -} - -static long allow_syscalls[] = { - __NR_write, /* logging */ - __NR_fsync, /* logging, WARNING and above fsync immediately */ - __NR_getpid, /* OpenSSL RAND_bytes checks pid, temporarily used as part of quic_init to generate a certificate */ - __NR_getrandom, /* OpenSSL RAND_bytes reads getrandom, temporarily used as part of quic_init to generate a certificate */ - __NR_madvise, /* OpenSSL SSL_do_handshake() uses an arena which eventually calls _rjem_je_pages_purge_forced */ - __NR_sendto, /* fd_xsk requires sendto */ - __NR_mmap, /* OpenSSL again... deep inside SSL_provide_quic_data() some jemalloc code calls mmap */ -}; - -static ulong -allow_fds( fd_frank_args_t * args, - ulong out_fds_sz, - int * out_fds ) { - if( FD_UNLIKELY( out_fds_sz < 4 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); - out_fds[ 0 ] = 2; /* stderr */ - out_fds[ 1 ] = 3; /* logfile */ - out_fds[ 2 ] = args->xsk->xsk_fd; - out_fds[ 3 ] = args->lo_xsk ? args->lo_xsk->xsk_fd : -1; - return args->lo_xsk ? 
4 : 3; -} - -fd_frank_task_t frank_quic = { - .name = "quic", - .in_wksp = NULL, - .out_wksp = "quic_verify", - .extra_wksp = "tpu_txn_data", - .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), - .allow_syscalls = allow_syscalls, - .allow_fds = allow_fds, - .init = init, - .run = run, -}; diff --git a/src/app/frank/fd_frank_verify.c b/src/app/frank/fd_frank_verify.c deleted file mode 100644 index 0820969651..0000000000 --- a/src/app/frank/fd_frank_verify.c +++ /dev/null @@ -1,408 +0,0 @@ -#include "fd_frank.h" -#include "../../ballet/txn/fd_txn.h" - -#include - -#include -#include - -static void -run( fd_frank_args_t * args ) { - /* - * Join the IPC objects needed this tile instance - */ - - FD_LOG_INFO(( "joining cnc" )); - fd_cnc_t * cnc = fd_cnc_join( fd_wksp_pod_map( args->tile_pod, "cnc" ) ); - if( FD_UNLIKELY( !cnc ) ) FD_LOG_ERR(( "fd_cnc_join failed" )); - if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_CNC_SIGNAL_BOOT ) ) FD_LOG_ERR(( "cnc not in boot state" )); - - ulong * cnc_diag = (ulong *)fd_cnc_app_laddr( cnc ); - cnc_diag[ FD_FRANK_CNC_DIAG_PID ] = (ulong)args->pid; - - /* In IPC objects */ - FD_LOG_INFO(( "joining mcache%lu", args->tile_idx )); - char path[ 32 ]; - snprintf( path, sizeof(path), "mcache%lu", args->tile_idx ); - fd_frag_meta_t * vin_mcache = fd_mcache_join( fd_wksp_pod_map( args->in_pod, path ) ); - if( FD_UNLIKELY( !vin_mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - ulong vin_mcache_depth = fd_mcache_depth ( vin_mcache ); - ulong * vin_mcache_sync = fd_mcache_seq_laddr( vin_mcache ); - ulong vin_mcache_seq = fd_mcache_seq_query( vin_mcache_sync ); - fd_frag_meta_t const * vin_mline = vin_mcache + fd_mcache_line_idx( vin_mcache_seq, vin_mcache_depth ); - - FD_LOG_INFO(( "joining dcache%lu", args->tile_idx )); - snprintf( path, sizeof(path), "dcache%lu", args->tile_idx ); - uchar * vin_dcache = fd_dcache_join( fd_wksp_pod_map( args->extra_pod, path ) ); - if( FD_UNLIKELY( !vin_dcache ) ) FD_LOG_ERR(( 
"fd_dcache_join failed" )); - fd_wksp_t * vin_wksp = fd_wksp_containing( vin_dcache ); /* chunks are referenced relative to the containing workspace */ - if( FD_UNLIKELY( !vin_wksp ) ) FD_LOG_ERR(( "fd_wksp_containing failed" )); - - FD_LOG_INFO(( "joining fseq%lu", args->tile_idx )); - snprintf( path, sizeof(path), "fseq%lu", args->tile_idx ); - ulong * vin_fseq = fd_fseq_join( fd_wksp_pod_map( args->in_pod, path ) ); - if( FD_UNLIKELY( !vin_fseq ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); - ulong * vin_fseq_diag = (ulong *)fd_fseq_app_laddr( vin_fseq ); - if( FD_UNLIKELY( !vin_fseq_diag ) ) FD_LOG_ERR(( "fd_fseq_app_laddr failed" )); - FD_COMPILER_MFENCE(); - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ] ) = 0UL; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ] ) = 0UL; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_FILT_CNT ] ) = 0UL; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_FILT_SZ ] ) = 0UL; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_OVRNP_CNT ] ) = 0UL; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_OVRNR_CNT ] ) = 0UL; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_SLOW_CNT ] ) = 0UL; /* Managed by the fctl */ - FD_COMPILER_MFENCE(); - ulong vin_accum_pub_cnt = 0UL; - ulong vin_accum_pub_sz = 0UL; - ulong vin_accum_ovrnp_cnt = 0UL; - ulong vin_accum_ovrnr_cnt = 0UL; - - /* Setup local objects used by this tile */ - - FD_LOG_INFO(( "configuring flow control" )); - ulong vin_cr_max = fd_pod_query_ulong( args->tile_pod, "cr_max", 0UL ); - ulong vin_cr_resume = fd_pod_query_ulong( args->tile_pod, "cr_resume", 0UL ); - ulong vin_cr_refill = fd_pod_query_ulong( args->tile_pod, "cr_refill", 0UL ); - long vin_lazy = fd_pod_query_long ( args->tile_pod, "lazy", 0L ); - FD_LOG_INFO(( "verifyin.%lu.cr_max %lu", args->tile_idx, vin_cr_max )); - FD_LOG_INFO(( "verifyin.%lu.cr_resume %lu", args->tile_idx, vin_cr_resume )); - FD_LOG_INFO(( "verifyin.%lu.cr_refill %lu", args->tile_idx, vin_cr_refill )); - FD_LOG_INFO(( "verifyin.%lu.lazy %li", args->tile_idx, vin_lazy )); - - 
fd_fctl_t * vin_fctl = fd_fctl_cfg_done( fd_fctl_cfg_rx_add( fd_fctl_join( fd_fctl_new( fd_alloca( FD_FCTL_ALIGN, - fd_fctl_footprint( 1UL ) ), - 1UL ) ), - vin_mcache_depth, vin_fseq, &vin_fseq_diag[ FD_FSEQ_DIAG_SLOW_CNT ] ), - 1UL /*cr_burst*/, vin_cr_max, vin_cr_resume, vin_cr_refill ); - if( FD_UNLIKELY( !vin_fctl ) ) FD_LOG_ERR(( "Unable to create flow control" )); - FD_LOG_INFO(( "using cr_burst %lu, cr_max %lu, cr_resume %lu, cr_refill %lu for verifyin %lu", - fd_fctl_cr_burst( vin_fctl ), fd_fctl_cr_max( vin_fctl ), fd_fctl_cr_resume( vin_fctl ), fd_fctl_cr_refill( vin_fctl ), args->tile_idx )); - - if( vin_lazy<=0L ) vin_lazy = fd_tempo_lazy_default( vin_mcache_depth ); - FD_LOG_INFO(( "using lazy %li ns", vin_lazy )); - ulong vin_async_min = fd_tempo_async_min( vin_lazy, 1UL /*event_cnt*/, (float)fd_tempo_tick_per_ns( NULL ) ); - if( FD_UNLIKELY( !vin_async_min ) ) FD_LOG_ERR(( "bad vin_lazy" )); - - /* Out IPC objects */ - int in_backp = 1; - - FD_COMPILER_MFENCE(); - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_IN_BACKP ] ) = 1UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_BACKP_CNT ] ) = 0UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_CNT ] ) = 0UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_SZ ] ) = 0UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_CNT ] ) = 0UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_SZ ] ) = 0UL; - FD_COMPILER_MFENCE(); - - FD_LOG_INFO(( "joining mcache%lu", args->tile_idx )); - snprintf( path, sizeof(path), "mcache%lu", args->tile_idx ); - fd_frag_meta_t * mcache = fd_mcache_join( fd_wksp_pod_map( args->out_pod, path ) ); - if( FD_UNLIKELY( !mcache ) ) FD_LOG_ERR(( "fd_mcache_join failed" )); - ulong depth = fd_mcache_depth( mcache ); - ulong * sync = fd_mcache_seq_laddr( mcache ); - ulong seq = fd_mcache_seq_query( sync ); - - FD_LOG_INFO(( "joining fseq%lu", args->tile_idx )); - snprintf( path, sizeof(path), "fseq%lu", args->tile_idx ); - ulong * fseq = fd_fseq_join( fd_wksp_pod_map( args->out_pod, 
path ) ); - if( FD_UNLIKELY( !fseq ) ) FD_LOG_ERR(( "fd_fseq_join failed" )); - ulong * fseq_diag = (ulong *)fd_fseq_app_laddr( fseq ); - if( FD_UNLIKELY( !fseq_diag ) ) FD_LOG_ERR(( "fd_fseq_app_laddr failed" )); - FD_VOLATILE( fseq_diag[ FD_FSEQ_DIAG_SLOW_CNT ] ) = 0UL; /* Managed by the fctl */ - - /* Setup local objects used by this tile */ - - FD_LOG_INFO(( "configuring flow control" )); - ulong cr_max = fd_pod_query_ulong( args->tile_pod, "cr_max", 0UL ); - ulong cr_resume = fd_pod_query_ulong( args->tile_pod, "cr_resume", 0UL ); - ulong cr_refill = fd_pod_query_ulong( args->tile_pod, "cr_refill", 0UL ); - long lazy = fd_pod_query_long ( args->tile_pod, "lazy", 0L ); - FD_LOG_INFO(( "cr_max %lu", cr_max )); - FD_LOG_INFO(( "cr_resume %lu", cr_resume )); - FD_LOG_INFO(( "cr_refill %lu", cr_refill )); - FD_LOG_INFO(( "lazy %li", lazy )); - - fd_fctl_t * fctl = fd_fctl_cfg_done( fd_fctl_cfg_rx_add( fd_fctl_join( fd_fctl_new( fd_alloca( FD_FCTL_ALIGN, - fd_fctl_footprint( 1UL ) ), - 1UL ) ), - depth, fseq, &fseq_diag[ FD_FSEQ_DIAG_SLOW_CNT ] ), - 1UL /*cr_burst*/, cr_max, cr_resume, cr_refill ); - if( FD_UNLIKELY( !fctl ) ) FD_LOG_ERR(( "Unable to create flow control" )); - FD_LOG_INFO(( "using cr_burst %lu, cr_max %lu, cr_resume %lu, cr_refill %lu", - fd_fctl_cr_burst( fctl ), fd_fctl_cr_max( fctl ), fd_fctl_cr_resume( fctl ), fd_fctl_cr_refill( fctl ) )); - - ulong cr_avail = 0UL; - - if( lazy<=0L ) lazy = fd_tempo_lazy_default( depth ); - FD_LOG_INFO(( "using lazy %li ns", lazy )); - ulong async_min = fd_tempo_async_min( lazy, 1UL /*event_cnt*/, (float)args->tick_per_ns ); - if( FD_UNLIKELY( !async_min ) ) FD_LOG_ERR(( "bad lazy" )); - - uint seed = fd_pod_query_uint( args->tile_pod, "seed", (uint)fd_tile_id() ); /* use app tile_id as default */ - FD_LOG_INFO(( "creating rng (seed %u)", seed )); - fd_rng_t _rng[ 1 ]; - fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, seed, 0UL ) ); - if( FD_UNLIKELY( !rng ) ) FD_LOG_ERR(( "fd_rng_join failed" )); - - /* FIXME: 
PROBABLY SHOULD PUT THIS IN WORKSPACE */ -# define TCACHE_DEPTH (16UL) /* Should be ~1/2-1/4 MAP_CNT */ -# define TCACHE_MAP_CNT (64UL) /* Power of two */ - uchar tcache_mem[ FD_TCACHE_FOOTPRINT( TCACHE_DEPTH, TCACHE_MAP_CNT ) ] __attribute__((aligned(FD_TCACHE_ALIGN))); - fd_tcache_t * tcache = fd_tcache_join( fd_tcache_new( tcache_mem, TCACHE_DEPTH, TCACHE_MAP_CNT ) ); - ulong tcache_depth = fd_tcache_depth ( tcache ); - ulong tcache_map_cnt = fd_tcache_map_cnt ( tcache ); - ulong * _tcache_sync = fd_tcache_oldest_laddr( tcache ); - ulong * _tcache_ring = fd_tcache_ring_laddr ( tcache ); - ulong * _tcache_map = fd_tcache_map_laddr ( tcache ); - ulong tcache_oldest = FD_VOLATILE_CONST( *_tcache_sync ); - - ulong accum_ha_filt_cnt = 0UL; ulong accum_ha_filt_sz = 0UL; - - fd_sha512_t _sha[1]; - fd_sha512_t * sha = fd_sha512_join( fd_sha512_new( _sha ) ); - if( FD_UNLIKELY( !sha ) ) FD_LOG_ERR(( "fd_sha512 join failed" )); - - ulong accum_sv_filt_cnt = 0UL; ulong accum_sv_filt_sz = 0UL; - - /* Start verifying */ - - FD_LOG_INFO(( "verify(%lu) run", args->tile_idx )); - - long now = fd_tickcount(); - long then = now; /* Do housekeeping on first iteration of run loop */ - fd_cnc_signal( cnc, FD_CNC_SIGNAL_RUN ); - - ulong sigvfy_pass_cnt =0UL; - ulong sigvfy_fail_cnt =0UL; - (void)sigvfy_pass_cnt; - (void)sigvfy_fail_cnt; - for(;;) { - - /* Do housekeeping at a low rate in the background */ - - if( FD_UNLIKELY( (now-then)>=0L ) ) { - /* - begin verifyin related - */ - - /* Send flow control credits */ - fd_fctl_rx_cr_return( vin_fseq, vin_mcache_seq ); - - /* Send synchronization info */ - - FD_COMPILER_MFENCE(); - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ] ) += vin_accum_pub_cnt; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ] ) += vin_accum_pub_sz; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_OVRNP_CNT ] ) += vin_accum_ovrnp_cnt; - FD_VOLATILE( vin_fseq_diag[ FD_FSEQ_DIAG_OVRNR_CNT ] ) += vin_accum_ovrnr_cnt; - FD_COMPILER_MFENCE(); - vin_accum_pub_cnt = 
0UL; - vin_accum_pub_sz = 0UL; - vin_accum_ovrnp_cnt = 0UL; - vin_accum_ovrnr_cnt = 0UL; - - /* - end verifyin related - */ - - /* Send synchronization info */ - fd_mcache_seq_update( sync, seq ); - FD_COMPILER_MFENCE(); - FD_VOLATILE( *_tcache_sync ) = tcache_oldest; - FD_COMPILER_MFENCE(); - - /* Send diagnostic info */ - fd_cnc_heartbeat( cnc, now ); - FD_COMPILER_MFENCE(); - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_CNT ] ) + accum_ha_filt_cnt; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_SZ ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_SZ ] ) + accum_ha_filt_sz; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_CNT ] ) + accum_sv_filt_cnt; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_SZ ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_SZ ] ) + accum_sv_filt_sz; - FD_COMPILER_MFENCE(); - accum_ha_filt_cnt = 0UL; - accum_ha_filt_sz = 0UL; - accum_sv_filt_cnt = 0UL; - accum_sv_filt_sz = 0UL; - - /* Receive command-and-control signals */ - ulong s = fd_cnc_signal_query( cnc ); - if( FD_UNLIKELY( s!=FD_CNC_SIGNAL_RUN ) ) { - if( FD_UNLIKELY( s!=FD_CNC_SIGNAL_HALT ) ) FD_LOG_ERR(( "Unexpected signal" )); - break; - } - - /* Receive flow control credits */ - cr_avail = fd_fctl_tx_cr_update( fctl, cr_avail, seq ); - if( FD_UNLIKELY( in_backp ) ) { - if( FD_LIKELY( cr_avail ) ) { - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_IN_BACKP ] ) = 0UL; - in_backp = 0; - } - } - - /* Reload housekeeping timer */ - then = now + (long)fd_tempo_async_reload( rng, async_min ); - } - - /* Check if we are backpressured */ - if( FD_UNLIKELY( !cr_avail ) ) { - if( FD_UNLIKELY( !in_backp ) ) { - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_IN_BACKP ] ) = 1UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_BACKP_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_BACKP_CNT ] )+1UL; - in_backp = 1; - } - FD_SPIN_PAUSE(); 
- now = fd_tickcount(); - continue; - } - - /* See if there are any transactions waiting to be packed */ - __m128i vin_seq_sig = fd_frag_meta_seq_sig_query( vin_mline ); - ulong vin_seq_found = fd_frag_meta_sse0_seq( vin_seq_sig ); - long vin_diff = fd_seq_diff( vin_seq_found, vin_mcache_seq ); - if( FD_UNLIKELY( vin_diff ) ) { /* caught up or overrun, optimize for expected sequence number ready */ - if( FD_LIKELY( vin_diff < 0L ) ) { - FD_SPIN_PAUSE(); - now = fd_tickcount(); - continue; - } - vin_accum_ovrnp_cnt++; - vin_mcache_seq = vin_seq_found; - /* can keep processing from the new seq */ - } - - ulong vin_sig_found = fd_frag_meta_sse0_sig( vin_seq_sig ); - if( FD_UNLIKELY( vin_sig_found ) ) { /* This is a dummy mcache entry to keep frags from getting overrun, do not process */ - vin_mcache_seq = fd_seq_inc( vin_mcache_seq, 1UL ); - vin_mline = vin_mcache + fd_mcache_line_idx( vin_mcache_seq, vin_mcache_depth ); - continue; - } - - uint chunk = vin_mline->chunk; - - ulong vin_data_sz = (ulong)vin_mline->sz; - uchar * udp_payload = (uchar *)fd_chunk_to_laddr( vin_wksp, chunk ); - - vin_accum_pub_cnt++; - vin_accum_pub_sz += (ulong)vin_mline->sz; - - /* Wind up for the next iteration for verifyin */ - vin_mcache_seq = fd_seq_inc( vin_mcache_seq, 1UL ); - vin_mline = vin_mcache + fd_mcache_line_idx( vin_mcache_seq, vin_mcache_depth ); - - ushort payload_sz = *(ushort*) (udp_payload + vin_data_sz - sizeof(ushort)); - fd_txn_t * txn = (fd_txn_t*) fd_ulong_align_up( (ulong)(udp_payload) + payload_sz, 2UL ); - - ulong const * public_key = (ulong const *)(udp_payload + txn->acct_addr_off); - ulong const * sig = (ulong const *)(udp_payload + txn->signature_off); - uchar const * msg = (uchar const *)(udp_payload + txn->message_off); - ulong msg_sz = (ulong)payload_sz - txn->message_off; - - /* Sig is already effectively a cryptographically secure hash of - public_key/private_key and message and sz. 
So use this to do a - quick dedup of ha traffic (FIXME: POTENTIAL DOS ATTACK IF - SOMEBODY COULD INTERCEPT TRAFFIC AND SUBMIT PACKETS WITH SAME - PUBLIC KEY, SIG AND GARBAGE MESSAGE AHEAD OF THE TRAFFIC ... - SEEMS UNLKELY AS THEY WOULD EITHER BE BEHIND THE INBOUND OR BE - A MITM THAT COULD JUST DISCARD INBOUND TRAFFIC). - - When running synthetic load though, we only have a very limited - set of messages and this dedup will be overly aggressive (as it - will spuriously matching earlier synthetic packets since they - are not resigned continuously) So we just mock this up for the - time being. */ - - ulong ha_tag = *sig; - int ha_dup; - FD_TCACHE_INSERT( ha_dup, tcache_oldest, _tcache_ring, tcache_depth, _tcache_map, tcache_map_cnt, ha_tag ); - if( FD_UNLIKELY( ha_dup ) ) { /* optimize for the non dup case */ - accum_ha_filt_cnt++; - accum_ha_filt_sz += payload_sz; //WW accum_ha_filt_sz += msg_framing + msg_sz; - now = fd_tickcount(); - continue; - } - - /* We appear to have a message to verify. So verify it. - - When running synthetic load, the synthetic data will not fail - at this point so we fake up some configurable rate of errors to - stress out the monitoring. (We could also slightly more - expensively get the same effect by corrupting the udp_payload - region before the verify.) */ - - int err = fd_ed25519_verify( msg, msg_sz, sig, public_key, sha ); - if (err) { - sigvfy_fail_cnt ++; - //FD_LOG_WARNING(( "fd_ed25519_verify failed for mcache[%lu], fail/pass: %lu/%lu", - // vin_seq_found, sigvfy_fail_cnt, sigvfy_pass_cnt )); - now = fd_tickcount(); - continue; - } - else { - sigvfy_pass_cnt ++; - } - - /* Packet looks superficially good. Forward it. If somebody is - opening multiple connections (which would potentially flow - steered to different verify tiles) and spammed these - connections with the same transaction, ha dedup here is likely - to miss that. But the dedup tile that muxes all the inputs - will take care of that. 
(The use of QUIC and the like should - also strongly reduce the economic incentives for this - behavior.) - - When running synthetic load, we have the same problem we had - above. So we use a signature that will match with the desired - probability. */ - - /* Note that sig is now guaranteed to be not FD_TCACHE_TAG_NULL - and we use the least significant 64-bits of the SHA-512 hash - for dedup purposes. */ - - now = fd_tickcount(); - ulong tspub = fd_frag_meta_ts_comp( now ); - int ctl_som = 1; - int ctl_eom = 1; - ulong ctl = fd_frag_meta_ctl( 0, ctl_som, ctl_eom, 0 ); - ulong tsorig = tspub; - fd_mcache_publish( mcache, depth, seq, ha_tag, chunk, vin_data_sz, ctl, tsorig, tspub ); - - seq = fd_seq_inc( seq, 1UL ); - cr_avail--; - - if( FD_UNLIKELY( !ctl_eom ) ) ctl_som = 0; - else { - ctl_som = 1; - } - } -} - -static long allow_syscalls[] = { - __NR_write, /* logging */ - __NR_fsync, /* logging, WARNING and above fsync immediately */ -}; - -static ulong -allow_fds( fd_frank_args_t * args, - ulong out_fds_sz, - int * out_fds ) { - (void)args; - if( FD_UNLIKELY( out_fds_sz < 2 ) ) FD_LOG_ERR(( "out_fds_sz %lu", out_fds_sz )); - out_fds[ 0 ] = 2; /* stderr */ - out_fds[ 1 ] = 3; /* logfile */ - return 2; -} - -fd_frank_task_t frank_verify = { - .name = "verify", - .in_wksp = "quic_verify", - .out_wksp = "verify_dedup", - .extra_wksp = "tpu_txn_data", - .allow_syscalls_sz = sizeof(allow_syscalls)/sizeof(allow_syscalls[ 0 ]), - .allow_syscalls = allow_syscalls, - .allow_fds = allow_fds, - .init = NULL, - .run = run, -}; diff --git a/src/app/frank/README.md b/src/disco/README.md similarity index 98% rename from src/app/frank/README.md rename to src/disco/README.md index 452f10cd21..a76b382a04 100644 --- a/src/app/frank/README.md +++ b/src/disco/README.md @@ -1,4 +1,4 @@ -# Frankendancer Frank +# Firedancer ## Architecture @@ -119,9 +119,9 @@ ### Pod layout for low level configuration ``` -[path to this frank instance's config] { +[path to this app instance's config] { 
- # There are 3 + verify_cnt tiles used by frank. verify_cnt is implied + # There are 3 + verify_cnt tiles used by the app. verify_cnt is implied # by the number of verify pods below. # # The logical tile indices for the main, pack and dedup tiles are @@ -229,7 +229,7 @@ } - # Additional configuration information specific to this frank instance + # Additional configuration information specific to this app instance # (all unrecognized fields will be silently ignored) } diff --git a/src/disco/dedup/Local.mk b/src/disco/dedup/Local.mk index 706abaa603..f639efc15f 100644 --- a/src/disco/dedup/Local.mk +++ b/src/disco/dedup/Local.mk @@ -1,4 +1,3 @@ $(call add-hdrs,fd_dedup.h) $(call add-objs,fd_dedup,fd_disco) -$(call make-bin,fd_dedup_tile,fd_dedup_tile,fd_disco fd_tango fd_util) -$(call make-unit-test,test_dedup,test_dedup,fd_disco fd_tango fd_util) +# $(call make-unit-test,test_dedup,test_dedup,fd_disco fd_tango fd_util) diff --git a/src/disco/dedup/fd_dedup.c b/src/disco/dedup/fd_dedup.c index 8067f997c9..28425e3ff6 100644 --- a/src/disco/dedup/fd_dedup.c +++ b/src/disco/dedup/fd_dedup.c @@ -1,587 +1,155 @@ #include "fd_dedup.h" -/* A fd_dedup_tile_in has all the state needed for deduping frags from - an in. It fits on exactly one cache line. */ +#include "../mux/fd_mux.h" -struct __attribute__((aligned(64))) fd_dedup_tile_in { - fd_frag_meta_t const * mcache; /* local join to this in's mcache */ - ulong depth; /* == fd_mcache_depth( mcache ), depth of this in's cache (const) */ - ulong seq; /* sequence number of next frag expected from the upstream producer, - updated when frag from this in is published / filtered */ - fd_frag_meta_t const * mline; /* == mcache + fd_mcache_line_idx( seq, depth ), location to poll next */ - ulong * fseq; /* local join to the fseq used to return flow control credits the in */ - uint accum[6]; /* local diagnostic accumualtors. These are drained during in housekeeping. 
*/ - /* Assumes FD_FSEQ_DIAG_{PUB_CNT,PUB_SZ,FILT_CNT,FILT_SZ,OVRNP_CNT,OVRNR_CONT} are 0:5 */ -}; +/* fd_dedup_ctx_t is the context object provided to callbacks from the + mux tile, and contains all state needed to progress the tile. */ -typedef struct fd_dedup_tile_in fd_dedup_tile_in_t; +typedef struct { + ulong tcache_depth; /* == fd_tcache_depth( tcache ), depth of this dedups's tcache (const) */ + ulong tcache_map_cnt; /* == fd_tcache_map_cnt( tcache ), number of slots to use for tcache map (const) */ + ulong * tcache_sync; /* == fd_tcache_oldest_laddr( tcache ), local join to the oldest key in the tcache */ + ulong * tcache_ring; + ulong * tcache_map; -/* fd_dedup_tile_in_update returns flow control credits to the in - assuming that there are at most exposed_cnt frags currently exposed - to reliable outs and drains the run-time diagnostics accumulated - since the last update. Note that, once an in sequence number has - been confirmed to have been consumed downstream, it will remain - consumed. So, we can optimize this (and guarantee a monotonically - increasing fseq from the in's point of view) by only sending when - this_in_seq-exposed_cnt ends up ahead of this_in_fseq. We still - drain diagnostics every update as we might still have diagnostic - accumulated since last update even when we don't need to update - this_in_fseq. See note below about quasi-atomic draining. + fd_dedup_in_ctx_t * in; - For a simple example in normal operation of this, consider the case - where, at last update for this in, outs were caught up, and since - then, the dedup forwarded 1 frag from this in, the dedup forwarded 1 - frag from another in, and the outs didn't make any progress on the - forwarded frags. At this point then, for the implementation below, - exposed_cnt will be 2 but this_in_seq will have advanced only 1 such - that this_in_seq-exposed_cnt will be before this_in_fseq. 
Thus, we - will have diagnostics to accumulate for this in but no update needed - for this_in_fseq. */ + void * out_wksp; + ulong out_chunk0; + ulong out_wmark; + ulong out_chunk; +} fd_dedup_ctx_t; -static inline void -fd_dedup_tile_in_update( fd_dedup_tile_in_t * in, - ulong exposed_cnt ) { - - /* Technically we don't need to use fd_fseq_query here as *in_fseq - is not volatile from the dedup's point of view. But we are - paranoid, it won't affect performance in this case and it is - consistent with typical fseq usages. */ +/* during_frag is called between pairs for sequence number checks, as + we are reading incoming frags. We don't actually need to copy the + fragment here, flow control prevents it getting overrun, and + downstream consumers could reuse the same chunk and workspace to + improve performance. - ulong * in_fseq = in->fseq; - ulong seq = fd_seq_dec( in->seq, exposed_cnt ); - if( FD_LIKELY( fd_seq_gt( seq, fd_fseq_query( in_fseq ) ) ) ) fd_fseq_update( in_fseq, seq ); + The bounds checking and copying here are defensive measures, - ulong * diag = (ulong *)fd_fseq_app_laddr( in_fseq ); - uint * accum = in->accum; - ulong a0 = (ulong)accum[0]; ulong a1 = (ulong)accum[1]; ulong a2 = (ulong)accum[2]; - ulong a3 = (ulong)accum[3]; ulong a4 = (ulong)accum[4]; ulong a5 = (ulong)accum[5]; - FD_COMPILER_MFENCE(); - diag[0] += a0; diag[1] += a1; diag[2] += a2; - diag[3] += a3; diag[4] += a4; diag[5] += a5; - FD_COMPILER_MFENCE(); - accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; - accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; -} + * In a functioning system, the bounds checking should never fail, + but we want to prevent an attacker with code execution on a producer + tile from trivially being able to jump to a consumer tile with + out of bounds chunks. 
+ + * For security reasons, we have chosen to isolate all workspaces from + one another, so for example, if the QUIC tile is compromised with + RCE, it cannot wait until the sigverify tile has verified a transaction, + and then overwrite the transaction while it's being processed by the + banking stage. */ + +static inline void +during_frag( void * _ctx, + ulong in_idx, + ulong sig, + ulong chunk, + ulong sz, + int * opt_filter ) { + (void)sig; + (void)opt_filter; + + fd_dedup_ctx_t * ctx = (fd_dedup_ctx_t *)_ctx; -#define SCRATCH_ALLOC( a, s ) (__extension__({ \ - ulong _scratch_alloc = fd_ulong_align_up( scratch_top, (a) ); \ - scratch_top = _scratch_alloc + (s); \ - (void *)_scratch_alloc; \ - })) + if( FD_UNLIKELY( chunkin[ in_idx ].chunk0 || chunk>=ctx->in[ in_idx ].wmark || sz > FD_TPU_DCACHE_MTU ) ) + FD_LOG_ERR(( "chunk %lu %lu corrupt, not in range [%lu,%lu)", chunk, sz, ctx->in[ in_idx ].chunk0, ctx->in[ in_idx ].wmark )); -FD_STATIC_ASSERT( alignof(fd_dedup_tile_in_t)<=FD_DEDUP_TILE_SCRATCH_ALIGN, packing ); + uchar * src = (uchar *)fd_chunk_to_laddr( ctx->in[in_idx].wksp, chunk ); + uchar * dst = (uchar *)fd_chunk_to_laddr( ctx->out_wksp, ctx->out_chunk ); -ulong -fd_dedup_tile_scratch_align( void ) { - return FD_DEDUP_TILE_SCRATCH_ALIGN; + fd_memcpy( dst, src, sz ); } -ulong -fd_dedup_tile_scratch_footprint( ulong in_cnt, - ulong out_cnt ) { - if( FD_UNLIKELY( in_cnt >FD_DEDUP_TILE_IN_MAX ) ) return 0UL; - if( FD_UNLIKELY( out_cnt>FD_DEDUP_TILE_OUT_MAX ) ) return 0UL; - ulong scratch_top = 0UL; - SCRATCH_ALLOC( alignof(fd_dedup_tile_in_t), in_cnt*sizeof(fd_dedup_tile_in_t) ); /* in */ - SCRATCH_ALLOC( alignof(ulong const *), out_cnt*sizeof(ulong const *) ); /* out_fseq */ - SCRATCH_ALLOC( alignof(ulong *), out_cnt*sizeof(ulong *) ); /* out_slow */ - SCRATCH_ALLOC( alignof(ulong), out_cnt*sizeof(ulong) ); /* out_seq */ - SCRATCH_ALLOC( alignof(ushort), (in_cnt+out_cnt+1UL)*sizeof(ushort) ); /* event_map */ - return fd_ulong_align_up( scratch_top, 
fd_dedup_tile_scratch_align() ); +/* After the transaction has been fully received, and we know we were + not overrun while reading it, check if it's a duplicate of a prior + transaction. */ + +static inline void +after_frag( void * _ctx, + ulong * opt_sig, + ulong * opt_chunk, + ulong * opt_sz, + int * opt_filter ) { + fd_dedup_ctx_t * ctx = (fd_dedup_ctx_t *)_ctx; + + int is_dup; + FD_TCACHE_INSERT( is_dup, *ctx->tcache_sync, ctx->tcache_ring, ctx->tcache_depth, ctx->tcache_map, ctx->tcache_map_cnt, *opt_sig ); + *opt_filter = is_dup; + if( FD_LIKELY( !*opt_filter ) ) { + *opt_chunk = ctx->out_chunk; + ctx->out_chunk = fd_dcache_compact_next( ctx->out_chunk, *opt_sz, ctx->out_chunk0, ctx->out_wmark ); + } } int fd_dedup_tile( fd_cnc_t * cnc, + ulong pid, ulong in_cnt, fd_frag_meta_t const ** in_mcache, ulong ** in_fseq, + uchar const ** in_dcache, fd_tcache_t * tcache, fd_frag_meta_t * mcache, + uchar * dcache, ulong out_cnt, - ulong ** _out_fseq, + ulong ** out_fseq, ulong cr_max, long lazy, fd_rng_t * rng, - void * scratch, - double tick_per_ns ) { - - /* cnc state */ - ulong * cnc_diag; /* ==fd_cnc_app_laddr( cnc ), local address of the dedup tile cnc diagnostic region */ - ulong cnc_diag_in_backp; /* is the run loop currently backpressured by one or more of the outs, in [0,1] */ - ulong cnc_diag_backp_cnt; /* Accumulates number of transitions of tile to backpressured between housekeeping events */ - - /* in frag stream state */ - ulong in_seq; /* current position in input poll sequence, in [0,in_cnt) */ - fd_dedup_tile_in_t * in; /* in[in_seq] for in_seq in [0,in_cnt) has information about input fragment stream currently at - position in_seq in the in_idx polling sequence. 
The ordering of this array is continuously - shuffled to avoid lighthousing effects in the output fragment stream at extreme fan-in and load */ - - /* tcache filter state */ - ulong tcache_depth; /* ==fd_tcache_depth ( tcache ), maximum unique sigs held by the tcache */ - ulong tcache_map_cnt; /* ==fd_tcache_map_cnt ( tcache ), number of map slots, integer power of 2 >= depth+2 */ - ulong * _tcache_sync; /* ==fd_tcache_oldest_laddr( tcache ), location where tcache sync info is updated */ - ulong * _tcache_ring; /* ==fd_tcache_ring_laddr ( tcache ), ring of unique sigs, indexed [0,depth) */ - ulong * _tcache_map; /* ==fd_tcache_map_laddr ( tcache ), map slots, indexed [0,map_cnt) */ - ulong tcache_sync; /* location of the oldest signature in ring, in [0,depth) */ + void * scratch ) { + fd_dedup_ctx_t ctx[1]; - /* out frag stream state */ - ulong depth; /* ==fd_mcache_depth( mcache ), depth of the mcache / positive integer power of 2 */ - ulong * sync; /* ==fd_mcache_seq_laddr( mcache ), local addr where dedup mcache sync info is published */ - ulong seq; /* next dedup frag sequence number to publish */ + fd_mux_callbacks_t callbacks[1] = { 0 }; + callbacks->during_frag = during_frag; + callbacks->after_frag = after_frag; - /* out flow control state */ - ulong cr_avail; /* number of flow control credits available to publish downstream, in [0,cr_max] */ - ulong cr_filt; /* number of filtered fragments we need to account for in the flow control state */ - ulong const ** out_fseq; /* out_fseq[out_idx] for out_idx in [0,out_cnt) is where to receive fctl credits from outs */ - ulong ** out_slow; /* out_slow[out_idx] for out_idx in [0,out_cnt) is where to accumulate slow events */ - ulong * out_seq; /* out_seq [out_idx] is the most recent observation of out_fseq[out_idx] */ - - /* housekeeping state */ - ulong event_cnt; /* ==in_cnt+out_cnt+1, total number of housekeeping events */ - ulong event_seq; /* current position in housekeeping event sequence, in [0,event_cnt) */ - 
ushort * event_map; /* current mapping of event_seq to event idx, event_map[ event_seq ] is next event to process */ - ulong async_min; /* minimum number of ticks between processing a housekeeping event, positive integer power of 2 */ + ulong scratch_top = (ulong)scratch; do { - - FD_LOG_INFO(( "Booting dedup (in-cnt %lu, out-cnt %lu)", in_cnt, out_cnt )); - if( FD_UNLIKELY( in_cnt >FD_DEDUP_TILE_IN_MAX ) ) { FD_LOG_WARNING(( "in_cnt too large" )); return 1; } - if( FD_UNLIKELY( out_cnt>FD_DEDUP_TILE_OUT_MAX ) ) { FD_LOG_WARNING(( "out_cnt too large" )); return 1; } - - if( FD_UNLIKELY( !scratch ) ) { - FD_LOG_WARNING(( "NULL scratch" )); - return 1; - } - - if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)scratch, fd_dedup_tile_scratch_align() ) ) ) { - FD_LOG_WARNING(( "misaligned scratch" )); - return 1; - } - - ulong scratch_top = (ulong)scratch; - - /* cnc state init */ - - if( FD_UNLIKELY( !cnc ) ) { FD_LOG_WARNING(( "NULL cnc" )); return 1; } - if( FD_UNLIKELY( fd_cnc_app_sz( cnc )<16UL ) ) { FD_LOG_WARNING(( "cnc app sz must be at least 16" )); return 1; } - if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_CNC_SIGNAL_BOOT ) ) { FD_LOG_WARNING(( "already booted" )); return 1; } - - cnc_diag = (ulong *)fd_cnc_app_laddr( cnc ); - - /* in_backp==1, backp_cnt==0 indicates waiting for initial credits, - cleared during first housekeeping if credits available */ - cnc_diag_in_backp = 1UL; - cnc_diag_backp_cnt = 0UL; - - /* in frag stream init */ - - in_seq = 0UL; /* First in to poll */ - in = (fd_dedup_tile_in_t *)SCRATCH_ALLOC( alignof(fd_dedup_tile_in_t), in_cnt*sizeof(fd_dedup_tile_in_t) ); - - ulong min_in_depth = (ulong)LONG_MAX; - - if( FD_UNLIKELY( !!in_cnt && !in_mcache ) ) { FD_LOG_WARNING(( "NULL in_mcache" )); return 1; } - if( FD_UNLIKELY( !!in_cnt && !in_fseq ) ) { FD_LOG_WARNING(( "NULL in_fseq" )); return 1; } - for( ulong in_idx=0UL; in_idxmcache = in_mcache[ in_idx ]; - this_in->fseq = in_fseq [ in_idx ]; - ulong const * this_in_sync = 
fd_mcache_seq_laddr_const( this_in->mcache ); - - this_in->depth = fd_mcache_depth( this_in->mcache ); min_in_depth = fd_ulong_min( min_in_depth, this_in->depth ); - this_in->seq = fd_mcache_seq_query( this_in_sync ); /* FIXME: ALLOW OPTION FOR MANUAL SPECIFICATION? */ - this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); - - this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U; - this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U; - } - - /* tcache filter init */ - + if( FD_UNLIKELY( !dcache ) ) { FD_LOG_WARNING(( "NULL dcache" )); return 1; } if( FD_UNLIKELY( !tcache ) ) { FD_LOG_WARNING(( "NULL tcache" )); return 1; } - tcache_depth = fd_tcache_depth ( tcache ); - tcache_map_cnt = fd_tcache_map_cnt ( tcache ); - _tcache_sync = fd_tcache_oldest_laddr( tcache ); - _tcache_ring = fd_tcache_ring_laddr ( tcache ); - _tcache_map = fd_tcache_map_laddr ( tcache ); - - FD_COMPILER_MFENCE(); - tcache_sync = FD_VOLATILE_CONST( *_tcache_sync ); - FD_COMPILER_MFENCE(); - - /* out frag stream init */ - - if( FD_UNLIKELY( !mcache ) ) { FD_LOG_WARNING(( "NULL mcache" )); return 1; } - - depth = fd_mcache_depth ( mcache ); - sync = fd_mcache_seq_laddr( mcache ); - - seq = fd_mcache_seq_query( sync ); /* FIXME: ALLOW OPTION FOR MANUAL SPECIFICATION */ - - /* out flow control init */ - - /* Since cr_avail is decremented everytime a frag is exposed to the - outs by the dedup, exposed_cnt=cr_max-cr_avail is the number - frags that are currently exposed. Similarly there might be up to - cr_filt duplicate frags that were filtered. Exposed frags can be - arbitrarily distributed over all ins and, in the worst case, - could all be from just one particular in. - - When the dedup sends flow control credits to an in, the dedup - decrements the actual dedup's position in sequence space by this - upper bound. 
This guarantees, even in the worst case (all frags - exposed downstream and filtered came from the smallest depth in), - the ins will never overrun any outs. It also means that the - dedup tile doesn't have to keep track of these frags are - distributed over the ins (simplifying the implementation and - increasing performance). - - This also implies that cr_max must be at most - min(in_mcache[*].depth) such that the dedup cannot expose a frag - from further back than the in itself can cache. It also can't be - larger than depth such that at most_depth frags will ever be - exposed to outs. - - This further implies that the dedup should continuously replenish - its credits from the outs whenever it has less than cr_max - credits. To see this, we can just apply the analysis from the - mux tile in the case where there is 100% unique frags. - - With continuous replenishing, exposed_cnt will always improve as - the outs make progress and filt_cnt can always be cleared whenever - exposed_cnt gets to 0. This improvement will then always be - reflected all the way to all the ins, preventing deadlock / - livelock situations. (Note this also handles the situation like - dedup_lag==in_cr_max and cr_filt==0 as, when exposed count - improves, it will make credits available for publication that the - dedup can use because it will see there are fragments ready for - publication in the in's mcache given the positive dedup_lag and - will advance dedup_in_seq accordingly when it publishes them). - - Since we need to continuously replenish our credits when all the - outs aren't full caught up and we want to optimize for the common - scenario of very deep buffers and large number of outputs, we do - not use the fctl object (we would need to turn off its state - machine and we would like to avoid the bursts of reads it would - do when replenishing). Instead, we use a customized flow control - algorithm here to lazily and stochastically observe, without - bursts, the fseqs continuously. 
- - Note that the default value for cr_max assumes that - in[*].depth==in[*].cr_max and out[*].lag_max==dedup.depth (which - also is optimize for the low duplication scenarios). The user - can override cr_max to handle more general application specific - situations. */ - - ulong cr_max_max = fd_ulong_min( min_in_depth, depth ); - if( !cr_max ) cr_max = cr_max_max; /* use default */ - FD_LOG_INFO(( "Using cr_max %lu", cr_max )); - if( FD_UNLIKELY( !((1UL<=cr_max) & (cr_max<=cr_max_max)) ) ) { - FD_LOG_WARNING(( "cr_max %lu must be in [1,%lu] for these mcaches", cr_max, cr_max_max )); - return 1; - } - - cr_avail = 0UL; /* Will be initialized by run loop */ - cr_filt = 0UL; - - out_fseq = (ulong const **)SCRATCH_ALLOC( alignof(ulong const *), out_cnt*sizeof(ulong const *) ); - out_slow = (ulong **) SCRATCH_ALLOC( alignof(ulong *), out_cnt*sizeof(ulong *) ); - out_seq = (ulong *) SCRATCH_ALLOC( alignof(ulong), out_cnt*sizeof(ulong) ); - - if( FD_UNLIKELY( !!out_cnt && !_out_fseq ) ) { FD_LOG_WARNING(( "NULL out_fseq" )); return 1; } - for( ulong out_idx=0UL; out_idx=0L ) ) { - ulong event_idx = (ulong)event_map[ event_seq ]; - - /* Do the next async event. event_idx: - out_cnt - send credits to in event_idx - out_cnt - 1. - Branch hints and order are optimized for the case: - out_cnt >~ in_cnt >~ 1. */ - - if( FD_LIKELY( event_idxout_cnt ) ) { /* in fctl for in in_idx */ - ulong in_idx = event_idx - out_cnt - 1UL; - - /* Send flow control credits and drain flow control diagnostics - for in_idx. At this point, there are at most - exposed_cnt=cr_max-cr_avail frags exposed to reliable - consumers mixed with up to cr_filt frags that got filtered by - the dedup. We don't know how these frags were distributed - across all ins but, in the worst case, they all might have - come from the this in. The sequence number of the oldest - exposed frag then is at least cr_max-cr_avail+cr_filt before - the next sequence number the dedup expects to receive from - that in (e.g. 
the dedup might have received cr_max-cr_avail - exposed frags first followed by cr_filt frags that got - filtered as duplicates). */ - - fd_dedup_tile_in_update( &in[ in_idx ], cr_max - cr_avail + cr_filt ); - - } else { /* event_idx==out_cnt, housekeeping event */ - - /* Send synchronization info */ - fd_mcache_seq_update( sync, seq ); - FD_COMPILER_MFENCE(); - FD_VOLATILE( *_tcache_sync ) = tcache_sync; - FD_COMPILER_MFENCE(); - - /* Send diagnostic info */ - /* When we drain, we don't do a fully atomic update of the - diagnostics as it is only diagnostic and it will still be - correct the usual case where individual diagnostic counters - aren't used by multiple writers spread over different threads - of execution. */ - fd_cnc_heartbeat( cnc, now ); - FD_COMPILER_MFENCE(); - cnc_diag[ FD_CNC_DIAG_IN_BACKP ] = cnc_diag_in_backp; - cnc_diag[ FD_CNC_DIAG_BACKP_CNT ] += cnc_diag_backp_cnt; - FD_COMPILER_MFENCE(); - cnc_diag_backp_cnt = 0UL; - - /* Receive command-and-control signals */ - ulong s = fd_cnc_signal_query( cnc ); - if( FD_UNLIKELY( s!=FD_CNC_SIGNAL_RUN ) ) { - if( FD_LIKELY( s==FD_CNC_SIGNAL_HALT ) ) break; - if( FD_UNLIKELY( s!=FD_DEDUP_CNC_SIGNAL_ACK ) ) { - char buf[ FD_CNC_SIGNAL_CSTR_BUF_MAX ]; - FD_LOG_WARNING(( "Unexpected signal %s (%lu) received; trying to resume", fd_cnc_signal_cstr( s, buf ), s )); - } - fd_cnc_signal( cnc, FD_CNC_SIGNAL_RUN ); - } - - /* Receive flow control credits. */ - - if( FD_LIKELY( cr_avail=event_cnt ) ) { - event_seq = 0UL; - - /* Randomize the order of event processing for the next event - event_cnt events to avoid lighthousing effects causing input - credit starvation at extreme fan in/fan out, extreme in load - and high credit return laziness. 
*/ - - ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); - ushort map_tmp = event_map[ swap_idx ]; - event_map[ swap_idx ] = event_map[ 0 ]; - event_map[ 0 ] = map_tmp; - - /* We also do the same with the ins to prevent there being a - correlated order frag origins from different inputs - downstream at extreme fan in and extreme in load. */ - - if( FD_LIKELY( in_cnt>1UL ) ) { - swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); - fd_dedup_tile_in_t in_tmp; - in_tmp = in[ swap_idx ]; - in[ swap_idx ] = in[ 0 ]; - in[ 0 ] = in_tmp; - } - } - - /* Reload housekeeping timer */ - then = now + (long)fd_tempo_async_reload( rng, async_min ); - } - - /* Check if we are backpressured. If so, count any transition into - a backpressured regime and spin to wait for flow control credits - to return. We don't do a fully atomic update here as it is only - diagnostic and it will still be correct in the usual case where - individual diagnostic counters aren't used by writers in - different threads of execution. We only count the transition - from not backpressured to backpressured. 
*/ - - if( FD_UNLIKELY( cr_avail<=cr_filt ) ) { - cnc_diag_backp_cnt += (ulong)!cnc_diag_in_backp; - cnc_diag_in_backp = 1UL; - FD_SPIN_PAUSE(); - now = fd_tickcount(); - continue; - } - cnc_diag_in_backp = 0UL; - - /* Select which in to poll next (randomized round robin) */ - - if( FD_UNLIKELY( !in_cnt ) ) { now = fd_tickcount(); continue; } - fd_dedup_tile_in_t * this_in = &in[ in_seq ]; - in_seq++; - if( in_seq>=in_cnt ) in_seq = 0UL; /* cmov */ - - /* Check if this in has any new fragments to dedup */ - - ulong this_in_seq = this_in->seq; - fd_frag_meta_t const * this_in_mline = this_in->mline; /* Already at appropriate line for this_in_seq */ - - FD_COMPILER_MFENCE(); - ulong seq_found = this_in_mline->seq; - FD_COMPILER_MFENCE(); - - long diff = fd_seq_diff( this_in_seq, seq_found ); - if( FD_UNLIKELY( diff ) ) { /* Caught up or overrun, optimize for new frag case */ - if( FD_UNLIKELY( diff<0L ) ) { /* Overrun (impossible if in is honoring our flow control) */ - this_in->seq = seq_found; /* Resume from here (probably reasonably current, could query in mcache sync directly instead) */ - this_in->accum[ FD_FSEQ_DIAG_OVRNP_CNT ]++; + ctx->tcache_depth = fd_tcache_depth ( tcache ); + ctx->tcache_map_cnt = fd_tcache_map_cnt ( tcache ); + ctx->tcache_sync = fd_tcache_oldest_laddr( tcache ); + ctx->tcache_ring = fd_tcache_ring_laddr ( tcache ); + ctx->tcache_map = fd_tcache_map_laddr ( tcache ); + + ctx->in = (fd_dedup_in_ctx_t*)SCRATCH_ALLOC( alignof(fd_dedup_in_ctx_t), in_cnt*sizeof(fd_dedup_in_ctx_t) ); + for( ulong i=0; iin[i].wksp = fd_wksp_containing( in_dcache[i] ); + ctx->in[i].chunk0 = fd_dcache_compact_chunk0( ctx->in[i].wksp, in_dcache[i] ); + ctx->in[i].wmark = fd_dcache_compact_wmark ( ctx->in[i].wksp, in_dcache[i], FD_TPU_DCACHE_MTU ); } - /* We have a new fragment to dedup. Try to load it. This attempt - should always be successful if in producers are honoring our flow - control. 
Since we can cheaply detect if there are - misconfigurations (should be an L1 cache hit / predictable branch - in the properly configured case), we do so anyway. Note that if - we are on a platform where AVX is atomic, this could be replaced - by a flat AVX load of the metadata and an extraction of the found - sequence number for higher performance. */ - - FD_COMPILER_MFENCE(); - ulong sig = this_in_mline->sig; - ulong chunk = (ulong)this_in_mline->chunk; - ulong sz = (ulong)this_in_mline->sz; - ulong ctl = (ulong)this_in_mline->ctl; - ulong tsorig = (ulong)this_in_mline->tsorig; - FD_COMPILER_MFENCE(); - ulong seq_test = this_in_mline->seq; - FD_COMPILER_MFENCE(); - - if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { /* Overrun while reading (impossible if this_in honoring our fctl) */ - this_in->seq = seq_test; /* Resume from here (probably reasonably current, could query in mcache sync instead) */ - this_in->accum[ FD_FSEQ_DIAG_OVRNR_CNT ]++; - /* Don't bother with spin as polling multiple locations */ - now = fd_tickcount(); - continue; - } - - /* We have successfully loaded the metadata. Decide whether it - is interesting downstream and publish or filter accordingly. */ - - int is_dup; - FD_TCACHE_INSERT( is_dup, tcache_sync, _tcache_ring, tcache_depth, _tcache_map, tcache_map_cnt, sig ); - if( FD_UNLIKELY( is_dup ) ) { /* Optimize for forwarding path */ - now = fd_tickcount(); - /* If there are any frags from this in that are currently exposed - downstream, this frag needs to be taken into acount in the flow - control info we send to this in (see note above). Since we do - not track the distribution of the source of exposed frags (or - how filtered frags might be interspersed with them), we do not - know this exactly. But we do not need to for flow control - purposes. If cr_avail==cr_max, we are guaranteed nothing is - exposed at all from this in (because nothing is exposed from - any in). 
If cr_availseq = this_in_seq; - this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); - - ulong diag_idx = FD_FSEQ_DIAG_PUB_CNT + 2UL*(ulong)is_dup; - this_in->accum[ diag_idx ]++; - this_in->accum[ diag_idx+1UL ] += (uint)sz; - } - - do { - - FD_LOG_INFO(( "Halting dedup" )); - - while( in_cnt ) { - ulong in_idx = --in_cnt; - fd_dedup_tile_in_t * this_in = &in[ in_idx ]; - fd_dedup_tile_in_update( this_in, 0UL ); /* exposed_cnt 0 assumes all reliable consumers caught up or shutdown */ + if( FD_UNLIKELY( !fd_dcache_compact_is_safe( fd_wksp_containing( dcache ), dcache, FD_TPU_DCACHE_MTU, fd_mcache_depth( mcache ) ) ) ) { + FD_LOG_WARNING(( "dcache not compatible with wksp base and mcache depth" )); + return 1; } - - FD_LOG_INFO(( "Halted dedup" )); - fd_cnc_signal( cnc, FD_CNC_SIGNAL_BOOT ); - + ctx->out_wksp = fd_wksp_containing( dcache ); + ctx->out_chunk0 = fd_dcache_compact_chunk0( ctx->out_wksp, dcache ); + ctx->out_wmark = fd_dcache_compact_wmark ( ctx->out_wksp, dcache, FD_TPU_DCACHE_MTU ); + ctx->out_chunk = ctx->out_chunk0; } while(0); - return 0; + return fd_mux_tile( cnc, + pid, + FD_MUX_FLAG_COPY, /* dedup copies frags, and does not run zero copy */ + in_cnt, + in_mcache, + in_fseq, + mcache, + out_cnt, + out_fseq, + cr_max, + lazy, + rng, + (void*)fd_ulong_align_up( scratch_top, FD_MUX_TILE_SCRATCH_ALIGN ), + ctx, + callbacks ); } - -#undef SCRATCH_ALLOC - diff --git a/src/disco/dedup/fd_dedup.h b/src/disco/dedup/fd_dedup.h index d705252af3..45e022ae93 100644 --- a/src/disco/dedup/fd_dedup.h +++ b/src/disco/dedup/fd_dedup.h @@ -4,261 +4,49 @@ /* fd_dedup provides services to deduplicate multiple streams of input fragments and present them to a mix of reliable and unreliable consumers as though they were generated by a single multi-stream - producer. The entire process is zero copy for the actual fragment - payloads and thus has extremely high throughput and extremely high - scalability. */ + producer. 
-#include "../fd_disco_base.h" - -/* Beyond the standard FD_CNC_SIGNAL_HALT, FD_DEDUP_CNC_SIGNAL_ACK can - be raised by a cnc thread with an open command session while the - dedup is in the RUN state. The dedup will transition from ACK->RUN - the next time it processes cnc signals to indicate it is running - normally. If a signal other than ACK, HALT, or RUN is raised, it - will be logged as unexpected and transitioned by back to RUN. */ + The dedup tile is simply a wrapper around the mux tile, that also + checks the transaction signature field for duplicates and filters + them out. */ -#define FD_DEDUP_CNC_SIGNAL_ACK (4UL) - -/* FD_DEDUP_TILE_IN_MAX and FD_DEDUP_TILE_OUT_MAX are the maximum number - of inputs and outputs respectively that a dedup tile can have. These - limits are more or less arbitrary from a functional correctness POV. - They mostly exist to set some practical upper bounds for things like - scratch footprint. The current value for IN_MAX is large enough to - have every possible frag meta origin be handled by a single thread. - (And out_max is set arbitrarily to match.) */ +#include "../fd_disco_base.h" -#define FD_DEDUP_TILE_IN_MAX FD_FRAG_META_ORIG_MAX -#define FD_DEDUP_TILE_OUT_MAX FD_FRAG_META_ORIG_MAX +/* fd_dedup_in_ctx_t is a context object for each in (producer) mcache + connected to the dedup tile. */ -/* FD_DEDUP_TILE_SCRATCH_{ALIGN,FOOTPRINT} specify the alignment and - footprint needed for a dedup tile scratch region that can support - in_cnt mcaches and out_cnt reliable outputs. ALIGN is an integer - power of 2 of at least double cache line to mitigate various kinds of - false sharing. FOOTPRINT will be an integer multiple of ALIGN. - {in,out}_cnt are assumed to be valid (i.e. at most - FD_DEDUP_TILE_{IN,OUT}_MAX). in_cnt and out_cnt are assumed to be - valid and safe against multiple evaluation. These are provided to - facilitate compile time declarations. 
*/ +typedef struct { + void * wksp; + ulong chunk0; + ulong wmark; +} fd_dedup_in_ctx_t; #define FD_DEDUP_TILE_SCRATCH_ALIGN (128UL) -#define FD_DEDUP_TILE_SCRATCH_FOOTPRINT( in_cnt, out_cnt ) \ - FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( \ - FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \ - 64UL, (in_cnt)*64UL ), \ - alignof(ulong *), (out_cnt)*sizeof(ulong *) ), \ - alignof(ulong *), (out_cnt)*sizeof(ulong *) ), \ - alignof(ulong), (out_cnt)*sizeof(ulong) ), \ - alignof(ushort), ((in_cnt)+(out_cnt)+1UL)*sizeof(ushort) ), \ +#define FD_DEDUP_TILE_SCRATCH_FOOTPRINT( in_cnt, out_cnt ) \ + FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \ + alignof(fd_dedup_in_ctx_t), (in_cnt)*sizeof(fd_dedup_in_ctx_t) ), \ + FD_MUX_TILE_SCRATCH_ALIGN, FD_MUX_TILE_SCRATCH_FOOTPRINT( in_cnt, out_cnt ) ), \ FD_DEDUP_TILE_SCRATCH_ALIGN ) FD_PROTOTYPES_BEGIN -/* fd_dedup_tile deduplicates multiple fragment streams described by the - in_mcaches into a single out_mcache that can be consumed by out_cnt - reliable consumers and an arbitrary number of unreliable consumers. - (While reliable consumers are simple to reason about, they have - especially high demands on their implementation as a single slow - reliable consumer can backpressure _all_ producers and _all_ other - consumers using the dedup.) - - The dedup tile uses the tag cache tcache and the frag metadata - signature field (sig) to do the deduplication. A frag is considered - a duplicate of another frag if its signature is found in the tcache. - When the dedup tile encounters a frag that is not a duplicate by this - definition, it will insert that frag's signature into the tcache - (evicting the oldest signature in the tcache when, as is typically - the case, the tcache is full). That is, after startup (i.e. 
the - tcache has seen at least depth unique frag signatures), this will - discard frags that whose signatures match any of the most recent - tcache depth unique signatures observed by the dedup tile. - - IMPORTANT! Strictly speaking, the dedup tile does not care about the - specifics of the tagging scheme other than signature method should - not produce a sig of FD_TCACHE_TAG_NULL. At the same time, this - implementation is strongly optimized for the case where sigs for - distinct frags are temporally uncorrelated (e.g. a quality hash of - the frag payload). - - DEDUP TILE THROUGHPUT CAN BE SIGNIFICANTLY DEGRADED IF SIGS FOR - DISTINCT FRAGS ARE TEMPORALLY CORRELATED. - - For example, using a raw payload sequence number for the sig is - extremely ill advised. But a quality integer-to-integer - non-cryptographic full-avalanche hash of a raw payload sequence - number can be useful especially in non-adversial cases and/or if the - hash function used is picked randomly from a large parameterized - family of such hashes. Likewise, a cryptographically secure hash - based signature is also useful here (and might already be available - in many common use cases) as it will provides strong guarantees in - adversarial cases. - - The order of frags among a group of streams covered by a single - in_mcache will be preserved. Frags from different groups of streams - can be arbitrarily interleaved (but this makes an extreme best effort - to avoid starvation and minimize slip between different groups of - streams). - - The sig, chunk, sz, ctl and tsorig input fragment metadata will be - unchanged by this tile. - - For seq, the dedup tile will resequence the unique frags from the - in_mcaches into a new total order consistent with the above ordering - guarantees. - - For ctl, it is up to the application to specify ctl for all streams - covered by the in_mcaches in a non-conflicting way. 
Specifically, at - any given time, ctl.orig field should uniquely identify an active - logical publisher such that a conusmer can correctly reassemble - multiple fragment messages from that ctl.orig. (As such, ctl.orig - could be used more flexibly if an application never does multiple - fragment messages.) - - For chunk, a consumer needs to be able to map a (ctl.orig,chunk) pair - to an address in that consumer's local address space. The simplest - and most performant way to do this (especially in simple NUMA - topologies) is to have all dcache's use the same workspace and have - each producer reference chunks relative to the containing workspace. - - For tsorig and tspub, the dedup tile will recompute tspub for - deduplicated fragments. Assuming the original publisher of the frag - set tsorig of the the fragment to when it started producing the - message to which the frag belongs and set tspub to the timestamp to - when it first published the frag, and that the producer, dedup and - consumer all have access to the same clock, a downstream consumer can - tell when a message started arriving, when it was first available to - for consumption and (by locally reading the clock) the time when it - actually started consuming. And the logic for doing so on the - consumer will be the same on the consumer regardless it is consuming - directly or through one or more rounds of deduping. - - When this is called, the cnc should be in the BOOT state. Returns 0 - on a successful run of the dedup tile. That is, the tile booted - successfully (transitioning the cnc from BOOT->RUN), ran (handling - any application specific cnc signals while running), and (after - receiving a HALT signal) halted successfully (transitioning the cnc - from HALT->BOOT before return). Returns a non-zero error code if the - tile fails to boot up (logs details ... the cnc will not be - transitioned from its original state and thus is likely bootable - again if its original state was BOOT). 
For maximally robust - operation in the current implementation, all reliable consumers - should be halted and/or caught up before this tile is halted. - - There are no theoretical restrictions on the fragment stream - in_mcache depths. Practically, it is recommend these be as large as - possible, especially for bursty streams and/or a large number of - reliable consumers. Similarly, there is no advantage from the - dedup's POV to using variable in_mcache depths. But there can be - unrelated reasons for variable mcache depths (e.g. hardware - requirements for a frag stream produced by custom hardware, needs for - non-dedup consumers of individual frag streams, etc). There might be - some marginal theoetical memory footprint benefits to using an - out_mcache depth smaller than in_depth when there are high levels of - duplication but since memory footprint is relatively cheap and worst - case usage patterns should cover the range from no duplication to - 100% duplication, this is unlikely practically to matter. There is - similarly no benefit from the dedup's POV to using a mcache depth - larger than the smallest input mcache (larger cannot be fully - utilized by the downstream outs due to the worst case scenarios with - the smallest in mcache). - - Note that a number of tricks can be done to facilitate making this - work with completely unreliable / non-backpressuring communications - from producer to dedup and dedup to consumer. The most efficient - trick being that producers tag their payloads uniquely with the - metadata sig. When an unreliable consumer reads the metadata from - the mcache, it learns the tag and then can read the payload from - direct from the in dcache (no communication links need to be reliable - in this regime and no verification read of the metadata is required - either ... does require some payload formatting requirements). 
This - is currently not done in the interest of generality (more - pedantically, this more about how applications handle fragment - streams and less about how the dedup tile functions). - - cr_max is the maximum number of flow control credits the dedup tile - is allowed for publishing frags to outs. It represents the maximum - number of frags a reliable out can lag behind the deduped stream and - the maximum number of frags from any in mcache that might be exposed - to the outs (because of deduplication, the _range_ of exposed frags - might be _larger_ than cr_max). Assuming all unique frags, in the - general case, the optimal value is usually - min(in[*].cr_max,out[*].lag_max). Noting that in[*].cr_max is in - [1,in_mcache[*].depth] and out[*].lag_max is in [1,mcache.depth], - cr_max must be in, at a minimum, - [1,min(in_mcache[*].depth,mcache.depth)]. If cr_max is zero, this - use a default cr_max of min(in_mcache[*].depth,mcache.depth). This - is equivalent to assuming, as is typically the case, outs are allowed - to lag the dedup by up to mcache.depth frags and in[*].cr_max is the - same as in_mcache[*].depth. - - lazy is the ballpark interval in ns for how often to receive credits - from an out (and, equivalently, how often to return credits to an - in). Too small a lazy will drown the system in cache coherence - traffic. Too large a lazy will kill system throughput because of - producers stalled waiting for credits. lazy should be roughly - proportional to cr_max and the constant of proportionality should be - less than the smaller of how fast a producer can generate frags / how - fast a consumer can process frags typically. <=0 indicates to pick a - conservative default. - - scratch points to tile scratch memory. fd_dedup_tile_scratch_align - and fd_dedup_tile_scratch_footprint return the required alignment and - footprint needed for this region. 
This memory region is exclusively - owned by the dedup tile while the tile is running and is ideally near - the core running the dedup tile. fd_dedup_tile_scratch_align will - return the same value as FD_DEDUP_TILE_SCRATCH_ALIGN. If - (in_cnt,out_cnt) is not valid, fd_dedup_tile_scratch_footprint - silently returns 0 so callers can diagnose configuration issues. - Otherwise, fd_dedup_tile_scratch_footprint will return the same value - as FD_DEDUP_TILE_SCRATCH_FOOTPRINT. - - A fd_dedup_tile will use the application regions of the fseqs and - cncs for accumulating standard diagnostics in the standard ways. - Except for FD_CNC_DIAG_IN_BACKP, none of the diagnostics are cleared - at boot (as such that they can be accumulated over multiple runs). - Clearing is up to monitoring scripts. It is recommend that inputs - and outputs also use their cnc and fseq application regions similarly - for monitoring simplicity / consistency. - - The lifetime of the cnc, mcaches, fseqs, tcache, rng and scratch used - by this tile should be a superset of this tile's lifetime. While - this tile is running, no other tile should use cnc for its command - and control, modify the tcache, publish into mcache, use the rng for - anything (and the rng should be be seeded distinctly from all other - rngs in the system), or use scratch for anything. This tile will act - as a reliable consumer of in_mcache metadata. This tile uses the - in_fseqs passed to it in the usual consumer ways (e.g. publishing - recent locations in the producers sequence space and updating - consumer oriented diagnostics) and the out_fseqs passed to it in the - usual producer ways (i.e. discovering the location of reliable - consumers in sequence space and updating producer oriented - diagnostics). The in_mcache, in_fseq and out_fseq arrays will not be - used the after the tile has successfully booted (transitioned the cnc - from BOOT to RUN) or returned (e.g. failed to boot), whichever comes - first. 
*/ - -FD_FN_CONST ulong -fd_dedup_tile_scratch_align( void ); - -FD_FN_CONST ulong -fd_dedup_tile_scratch_footprint( ulong in_cnt, - ulong out_cnt ); - int -fd_dedup_tile( fd_cnc_t * cnc, /* Local join to the dedup's command-and-control */ - ulong in_cnt, /* Number of input mcaches to dedup, inputs are indexed [0,in_cnt) */ - fd_frag_meta_t const ** in_mcache, /* in_mcache[in_idx] is the local join to input in_idx's mcache */ - ulong ** in_fseq, /* in_fseq [in_idx] is the local join to input in_idx's fseq */ - fd_tcache_t * tcache, /* Local join to the dedup's unique signature cache */ - fd_frag_meta_t * mcache, /* Local join to the dedup's frag stream output mcache */ - ulong out_cnt, /* Number of reliable consumers, reliable consumers are indexed [0,out_cnt) */ - ulong ** out_fseq, /* out_fseq[out_idx] is the local join to reliable consumer out_idx's fseq */ - ulong cr_max, /* Maximum number of flow control credits, 0 means use a reasonable default */ - long lazy, /* Lazyiness, <=0 means use a reasonable default */ - fd_rng_t * rng, /* Local join to the rng this dedup should use */ - void * scratch, /* Tile scratch memory */ - double tick_per_ns ); /* Result of fd_tempo_tick_per_ns( NULL ) */ +fd_dedup_tile( fd_cnc_t * cnc, /* Local join to the dedups's command-and-control */ + ulong pid, /* Tile PID for diagnostic purposes */ + ulong in_cnt, /* Number of input mcaches to multiplex, inputs are indexed [0,in_cnt) */ + fd_frag_meta_t const ** in_mcache, /* in_mcache[in_idx] is the local join to input in_idx's mcache */ + ulong ** in_fseq, /* in_fseq [in_idx] is the local join to input in_idx's fseq */ + uchar const ** in_dcache, /* in_dcache[in_idx] is the local join to input in_idx's dcache */ + fd_tcache_t * tcache, /* Local join to the dedup's tcache for deduplicating signatures */ + fd_frag_meta_t * mcache, /* Local join to the dedups's frag stream output mcache */ + uchar * dcache, /* Local join to the dedups's frag stream output dcache */ + ulong out_cnt, /* 
Number of reliable consumers, reliable consumers are indexed [0,out_cnt) */ + ulong ** out_fseq, /* out_fseq[out_idx] is the local join to reliable consumer out_idx's fseq */ + ulong cr_max, /* Maximum number of flow control credits, 0 means use a reasonable default */ + long lazy, /* Laziness, <=0 means use a reasonable default */ + fd_rng_t * rng, /* Local join to the rng this dedup should use */ + void * scratch ); /* Tile scratch memory */ FD_PROTOTYPES_END #endif /* HEADER_fd_src_disco_dedup_fd_dedup_h */ - diff --git a/src/disco/dedup/fd_dedup_tile.c b/src/disco/dedup/fd_dedup_tile.c deleted file mode 100644 index eb634e6beb..0000000000 --- a/src/disco/dedup/fd_dedup_tile.c +++ /dev/null @@ -1,120 +0,0 @@ -#include "../fd_disco.h" - -#if FD_HAS_HOSTED - -FD_STATIC_ASSERT( FD_DEDUP_TILE_SCRATCH_ALIGN<=FD_SHMEM_HUGE_PAGE_SZ, alignment ); - -int -main( int argc, - char ** argv ) { - fd_boot( &argc, &argv ); - - FD_LOG_NOTICE(( "Init" )); - - char const * _cnc = fd_env_strip_cmdline_cstr ( &argc, &argv, "--cnc", NULL, NULL ); - char const * _in_mcaches = fd_env_strip_cmdline_cstr ( &argc, &argv, "--in-mcaches", NULL, "" ); - char const * _in_fseqs = fd_env_strip_cmdline_cstr ( &argc, &argv, "--in-fseqs", NULL, "" ); - char const * _tcache = fd_env_strip_cmdline_cstr ( &argc, &argv, "--tcache", NULL, NULL ); - char const * _mcache = fd_env_strip_cmdline_cstr ( &argc, &argv, "--mcache", NULL, NULL ); - char const * _out_fseqs = fd_env_strip_cmdline_cstr ( &argc, &argv, "--out-fseqs", NULL, "" ); - ulong cr_max = fd_env_strip_cmdline_ulong( &argc, &argv, "--cr-max", NULL, 0UL ); /* 0 <> use default */ - long lazy = fd_env_strip_cmdline_long ( &argc, &argv, "--lazy", NULL, 0L ); /* <=0 <> use default */ - uint seed = fd_env_strip_cmdline_uint ( &argc, &argv, "--seed", NULL, (uint)(ulong)fd_tickcount() ); - - if( FD_UNLIKELY( !_cnc ) ) FD_LOG_ERR(( "--cnc not specified" )); - FD_LOG_NOTICE(( "Joining --cnc %s", _cnc )); - fd_cnc_t * cnc = fd_cnc_join( fd_wksp_map(
_cnc ) ); - if( FD_UNLIKELY( !cnc ) ) FD_LOG_ERR(( "fd_cnc_join failed" )); - - char * _in_mcache[ 256 ]; - ulong in_cnt = fd_cstr_tokenize( _in_mcache, 256UL, (char *)_in_mcaches, ',' ); /* argv is non-const */ - if( FD_UNLIKELY( in_cnt>256UL ) ) FD_LOG_ERR(( "too many --in-mcaches specified for current implementation" )); - - fd_frag_meta_t const * in_mcache[ 256 ]; - for( ulong in_idx=0UL; in_idx256UL ) ) FD_LOG_ERR(( "too many --out-fseqs specified for current implementation" )); - - ulong * out_fseq[ 256 ]; - for( ulong out_idx=0UL; out_idx UINT_MAX ) ) { FD_LOG_WARNING(( "in_cnt too large" )); return 1; } for( ulong in_idx=0UL; in_idxfseq = in_fseq [ in_idx ]; ulong const * this_in_sync = fd_mcache_seq_laddr_const( this_in->mcache ); - this_in->depth = fd_mcache_depth( this_in->mcache ); min_in_depth = fd_ulong_min( min_in_depth, this_in->depth ); - this_in->seq = fd_mcache_seq_query( this_in_sync ); /* FIXME: ALLOW OPTION FOR MANUAL SPECIFICATION? */ - this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); + ulong depth = fd_mcache_depth( this_in->mcache ); min_in_depth = fd_ulong_min( min_in_depth, depth ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) { FD_LOG_WARNING(( "in_mcache[%lu] too deep", in_idx )); return 1; } + this_in->depth = (uint)depth; + this_in->idx = (uint)in_idx; + this_in->seq = fd_mcache_seq_query( this_in_sync ); /* FIXME: ALLOW OPTION FOR MANUAL SPECIFICATION? */ + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U; this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U; @@ -205,23 +210,25 @@ fd_mux_tile( fd_cnc_t * cnc, /* Since cr_avail is decremented everytime a frag is exposed to the outs by the mux, exposed_cnt=cr_max-cr_avail is the number frags - that are currently exposed. 
Exposed frags can be arbitrarily - distributed over all ins and, in the worst case, could all be - from just one particular in. + that are currently exposed. Similarly there might be up to + cr_filt duplicate frags that were filtered. Exposed frags can be + arbitrarily distributed over all ins and, in the worst case, + could all be from just one particular in. When the mux sends flow control credits to an in, the mux decrements the actual mux's position in sequence space by this upper bound. This guarantees, even in the worst case (all frags - exposed downstream came from the smallest depth in), the ins will - never overrun any outs. It also means that the mux tile doesn't - have to keep track of how exposed frags are distributed over the - ins (simplifying the implementation and increasing performance). + exposed downstream and filtered came from the smallest depth in), + the ins will never overrun any outs. It also means that the mux + tile doesn't have to keep track of how these frags are + distributed over the ins (simplifying the implementation and + increasing performance). This also implies that cr_max must be at most - min(in_mcache[*].depth) such that the mux cannot never expose - more frags from an in than the in itself can cache. It also - can't be larger than depth such that at most_depth frags will - ever be exposed to outs. + min(in_mcache[*].depth) such that the mux cannot expose a frag + from further back than the in itself can cache. It also can't be + larger than depth such that at most_depth frags will ever be + exposed to outs. This further implies that the mux should continuously replenish its credits from the outs whenever it has less than cr_max @@ -256,7 +263,7 @@ fd_mux_tile( fd_cnc_t * cnc, Note that: mux_in_seq = in_seq - mux_lag - + where mux lag is the number of frags behind the mux is from the in and, because of in<>mux flow control, this is in [0,in_cr_max]. 
Simplifying, we need to insure: @@ -272,14 +279,16 @@ fd_mux_tile( fd_cnc_t * cnc, in_cr_max is positive though, this can never be true. With continuous replenishing, exposed_cnt will always improve as - the outs make progress and this improvement will then always be - reflected all the way to all the ins, preventing deadlock / - livelock situations. (Note this also handles the situation where - mux_lag==in_cr_max as, when exposed count improves, it will make - credits available for publication that the mux can use because it - will see there are fragments ready for publication in the in's - mcache given the positive mux_lag and will advance mux_in_seq - accordingly when it publishes them). + the outs make progress and filt_cnt can always be cleared + whenever exposed_cnt gets to 0. This improvement will then + always be reflected all the way to all the ins, preventing + deadlock / livelock situations. (Note this also handles the + situation like mux_lag==in_cr_max and cr_filt==0 as, when exposed + count improves, it will make credits available for publication + that the mux can use because it will see there are fragments + ready for publication in the in's mcache given the positive + mux_lag and will advance mux_in_seq accordingly when it publishes + them). 
Since we need to continuously replenish our credits when all the outs aren't full caught up and we want to optimize for the common @@ -304,6 +313,7 @@ fd_mux_tile( fd_cnc_t * cnc, } cr_avail = 0UL; /* Will be initialized by run loop */ + cr_filt = 0UL; out_fseq = (ulong const **)SCRATCH_ALLOC( alignof(ulong const *), out_cnt*sizeof(ulong const *) ); out_slow = (ulong **) SCRATCH_ALLOC( alignof(ulong *), out_cnt*sizeof(ulong *) ); @@ -322,9 +332,9 @@ fd_mux_tile( fd_cnc_t * cnc, if( lazy<=0L ) lazy = fd_tempo_lazy_default( cr_max ); FD_LOG_INFO(( "Configuring housekeeping (lazy %li ns)", lazy )); - /* Initialize the initial housekeeping event sequence to immediately - update cr_avail on the first run loop iteration and then update - all the ins accordingly. */ + /* Initialize the initial event sequence to immediately update + cr_avail on the first run loop iteration and then update all the + ins accordingly. */ event_cnt = in_cnt + 1UL + out_cnt; event_map = (ushort *)SCRATCH_ALLOC( alignof(ushort), event_cnt*sizeof(ushort) ); @@ -366,13 +376,19 @@ fd_mux_tile( fd_cnc_t * cnc, ulong in_idx = event_idx - out_cnt - 1UL; /* Send flow control credits and drain flow control diagnostics - for this in. Note that there are at most cr_max-cr_avail - frags exposed downstream. We don't know how this is - distributed so we conservatively assume they are all from - this in. FIXME: COULD DO A NUMBER OF TRICKS FOR AN EVEN - TIGHTER BOUND HERE (E.G. EXPLICITLY TRACKING THE NUMBER OF - FRAGS EXPOSED PER UPSTREAM CONSUMER FOR EXAMPLE). */ - fd_mux_tile_in_update( &in[ in_idx ], cr_max-cr_avail ); + for in_idx. At this point, there are at most + exposed_cnt=cr_max-cr_avail frags exposed to reliable + consumers mixed with up to cr_filt frags that got filtered by + the mux. We don't know how these frags were distributed + across all ins but, in the worst case, they all might have + come from the this in. 
The sequence number of the oldest + exposed frag then is at least cr_max-cr_avail+cr_filt before + the next sequence number the mux expects to receive from + that in (e.g. the mux might have received cr_max-cr_avail + exposed frags first followed by cr_filt frags that got + filtered). */ + + fd_mux_tile_in_update( &in[ in_idx ], cr_max - cr_avail + cr_filt ); } else { /* event_idx==out_cnt, housekeeping event */ @@ -387,9 +403,11 @@ fd_mux_tile( fd_cnc_t * cnc, of execution. */ fd_cnc_heartbeat( cnc, now ); FD_COMPILER_MFENCE(); + if( FD_LIKELY( callbacks->cnc_diag_write ) ) callbacks->cnc_diag_write( ctx, cnc_diag ); cnc_diag[ FD_CNC_DIAG_IN_BACKP ] = cnc_diag_in_backp; cnc_diag[ FD_CNC_DIAG_BACKP_CNT ] += cnc_diag_backp_cnt; FD_COMPILER_MFENCE(); + if( FD_LIKELY( callbacks->cnc_diag_clear ) ) callbacks->cnc_diag_clear( ctx ); cnc_diag_backp_cnt = 0UL; /* Receive command-and-control signals */ @@ -412,12 +430,17 @@ fd_mux_tile( fd_cnc_t * cnc, slowest_out = fd_ulong_if( out_cr_avail1UL ) ) { + swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + fd_mux_tile_in_t in_tmp; + in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } } /* Reload housekeeping timer */ then = now + (long)fd_tempo_async_reload( rng, async_min ); } + fd_mux_context_t mux = { + .mcache = mcache, + .depth = depth, + .cr_avail = &cr_avail, + .seq = &seq, + }; + + if( FD_LIKELY( callbacks->before_credit ) ) callbacks->before_credit( ctx, &mux ); + /* Check if we are backpressured. If so, count any transition into a backpressured regime and spin to wait for flow control credits to return. We don't do a fully atomic update here as it is only - diagnostic and it will still be correct the usual case where + diagnostic and it will still be correct in the usual case where individual diagnostic counters aren't used by writers in different threads of execution. We only count the transition from not backpressured to backpressured. 
*/ - if( FD_UNLIKELY( !cr_avail ) ) { + if( FD_UNLIKELY( cr_avail<=cr_filt ) ) { cnc_diag_backp_cnt += (ulong)!cnc_diag_in_backp; cnc_diag_in_backp = 1UL; FD_SPIN_PAUSE(); @@ -470,6 +504,8 @@ fd_mux_tile( fd_cnc_t * cnc, } cnc_diag_in_backp = 0UL; + if( FD_LIKELY( callbacks->after_credit ) ) callbacks->after_credit( ctx, &mux ); + /* Select which in to poll next (randomized round robin) */ if( FD_UNLIKELY( !in_cnt ) ) { now = fd_tickcount(); continue; } @@ -482,9 +518,20 @@ fd_mux_tile( fd_cnc_t * cnc, ulong this_in_seq = this_in->seq; fd_frag_meta_t const * this_in_mline = this_in->mline; /* Already at appropriate line for this_in_seq */ - FD_COMPILER_MFENCE(); - ulong seq_found = this_in_mline->seq; - FD_COMPILER_MFENCE(); + __m128i seq_sig = fd_frag_meta_seq_sig_query( this_in_mline ); + ulong seq_found = fd_frag_meta_sse0_seq( seq_sig ); + if( FD_UNLIKELY( callbacks->before_frag ) ) { + int filter; + callbacks->before_frag( ctx, (ulong)this_in->idx, seq, fd_frag_meta_sse0_sig( seq_sig ), &filter ); + if( FD_UNLIKELY( filter ) ) { + if( FD_UNLIKELY( !(flags & FD_MUX_FLAG_COPY) ) ) cr_filt += (ulong)(cr_availseq = this_in_seq; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); + now = fd_tickcount(); + continue; + } + } long diff = fd_seq_diff( this_in_seq, seq_found ); if( FD_UNLIKELY( diff ) ) { /* Caught up or overrun, optimize for new frag case */ @@ -498,14 +545,13 @@ fd_mux_tile( fd_cnc_t * cnc, } /* We have a new fragment to mux. Try to load it. This attempt - should always be successful if in producers are honoring our flow - control. Since we can cheaply detect if there are - misconfigurations (should be an L1 cache hit / predictable branch - in the properly configured case), we do so anyway. Note that if - we are on a platform where AVX is atomic, this could be replaced - by a flat AVX load of the metadata and an extraction of the found - sequence number for higher performance. 
*/ - + should always be successful if in producers are honoring our flow + control. Since we can cheaply detect if there are + misconfigurations (should be an L1 cache hit / predictable branch + in the properly configured case), we do so anyway. Note that if + we are on a platform where AVX is atomic, this could be replaced + by a flat AVX load of the metadata and an extraction of the found + sequence number for higher performance. */ FD_COMPILER_MFENCE(); ulong sig = this_in_mline->sig; ulong chunk = (ulong)this_in_mline->chunk; @@ -516,6 +562,9 @@ fd_mux_tile( fd_cnc_t * cnc, ulong seq_test = this_in_mline->seq; FD_COMPILER_MFENCE(); + int filter = 0; + if( FD_LIKELY( callbacks->during_frag ) ) callbacks->during_frag( ctx, (ulong)this_in->idx, sig, chunk, sz, &filter ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { /* Overrun while reading (impossible if this_in honoring our fctl) */ this_in->seq = seq_test; /* Resume from here (probably reasonably current, could query in mcache sync instead) */ this_in->accum[ FD_FSEQ_DIAG_OVRNR_CNT ]++; @@ -524,18 +573,29 @@ fd_mux_tile( fd_cnc_t * cnc, continue; } - /* We have successfully loaded the metadata. Decide whether it - is interesting downstream. If so, publish it. */ + if( FD_LIKELY( !filter ) ) { + /* We have successfully loaded the metadata. Decide whether it + is interesting downstream and publish or filter accordingly. */ - ulong should_filter = 0UL; /* FIXME: FILTERING LOGIC HERE */ + if( FD_LIKELY( callbacks->after_frag ) ) callbacks->after_frag( ctx, &sig, &chunk, &sz, &filter ); + } - if( FD_UNLIKELY( should_filter ) ) now = fd_tickcount(); /* Optimize for forwarding path */ - else { - now = fd_tickcount(); + now = fd_tickcount(); + if( FD_UNLIKELY( filter ) ) { + /* If there are any frags from this in that are currently exposed + downstream, this frag needs to be taken into account in the flow + control info we send to this in (see note above).
Since we do + not track the distribution of the source of exposed frags (or + how filtered frags might be interspersed with them), we do not + know this exactly. But we do not need to for flow control + purposes. If cr_avail==cr_max, we are guaranteed nothing is + exposed at all from this in (because nothing is exposed from + any in). If cr_availseq = this_in_seq; this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); - ulong diag_idx = FD_FSEQ_DIAG_PUB_CNT + should_filter*2UL; + ulong diag_idx = FD_FSEQ_DIAG_PUB_CNT + 2UL*(ulong)filter; this_in->accum[ diag_idx ]++; this_in->accum[ diag_idx+1UL ] += (uint)sz; } @@ -567,5 +627,18 @@ fd_mux_tile( fd_cnc_t * cnc, return 0; } +void +fd_mux_publish( fd_mux_context_t * ctx, + ulong sig, + ulong chunk, + ulong sz, + ulong ctl, + ulong tsorig, + ulong tspub ) { + fd_mcache_publish( ctx->mcache, ctx->depth, *ctx->seq, sig, chunk, sz, ctl, tsorig, tspub ); + (*ctx->cr_avail)--; + *ctx->seq = fd_seq_inc( *ctx->seq, 1UL ); +} + #undef SCRATCH_ALLOC diff --git a/src/disco/mux/fd_mux.h b/src/disco/mux/fd_mux.h index 96552bfdd6..cf99d6ad03 100644 --- a/src/disco/mux/fd_mux.h +++ b/src/disco/mux/fd_mux.h @@ -30,6 +30,46 @@ #define FD_MUX_TILE_IN_MAX FD_FRAG_META_ORIG_MAX #define FD_MUX_TILE_OUT_MAX FD_FRAG_META_ORIG_MAX +/* FD_MUX_FLAG_* are user provided flags specifying how to run the mux + tile. + + FD_MUX_FLAG_DEFAULT + Default mux operating mode. +` + FD_MUX_FLAG_MANUAL_PUBLISH + By default, the mux will automatically publish received frags that + are not filtered to the output mcache. If this flag is set, the + mux does not publish frags, and publishing must be done by the + user provided callbacks. + + Note that it is not safe for a user to publish frags directly to + the mcache which is managed by the mux, as the mux would not know + about them for flow control purposes. Instead, you should publish + by calling fd_mux_publish. 
If the mux is created with + MANUAL_PUBLISH the mux will still do flow control on reliable + consumers to ensure they are not overrun, but other mux properties may + no longer hold, particularly about the interleaving and ordering + of the frags. + + FD_MUX_FLAG_COPY + The mux tile is not zero copy, meaning it copies frag payloads and + does not simply republish a pointer to the incoming frag payload + on to downstream consumers. Because of this, flow control is less + complicated since we no longer need to track the number of + filtered frags in addition to published ones. If this flag is + set, it means the user promises that all published frags have been + copied and the mux does not need to track filtered ones. + + Practically, to implement frag copying, the caller would need to + either + (a) Pass FD_MUX_FLAG_MANUAL_PUBLISH and call publish manually + on fragments they manage. + (b) Set the opt_chunk in the fd_mux_after_frag_fn callback to + point to a copy of the frag payload. */ +#define FD_MUX_FLAG_DEFAULT 0 +#define FD_MUX_FLAG_MANUAL_PUBLISH 1 +#define FD_MUX_FLAG_COPY 2 + /* FD_MUX_TILE_SCRATCH_{ALIGN,FOOTPRINT} specify the alignment and footprint needed for a mux tile scratch region that can support in_cnt inputs and out_cnt outputs. ALIGN is an integer power of 2 of @@ -51,6 +91,189 @@ alignof(ushort), ((in_cnt)+(out_cnt)+1UL)*sizeof(ushort) ), \ FD_MUX_TILE_SCRATCH_ALIGN ) +/* fd_mux_context_t is an opaque type that is passed to the user + provided callbacks. The user can use this mux object to publish + messages to the downstream consumers by calling fd_mux_publish( mux ), + where mux is the fd_mux_context_t object passed to the callbacks. + + This is the only supported way of publishing, as the mux needs to + keep housekeeping information related to flow control. + + The user callback should not modify any fields of the mux context. 
*/ + +typedef struct { + fd_frag_meta_t * mcache; + ulong depth; + ulong * cr_avail; + ulong * seq; +} fd_mux_context_t; + +/* fd_mux_before_credit_fn is called every iteration of the mux run loop, + whether there is a new frag ready to receive or not. This callback + is also still invoked even if the mux is backpressured and cannot + read any new fragments while waiting for downstream consumers to + catch up. + + This callback is useful for things that need to occur even if no new + frags are being handled. For example, servicing network connections + could happen here. + + The ctx is a user-provided context object from when the mux tile was + initialized. The mux is the mux which is invoking this callback. + The mux should only be used for calling fd_mux_publish to publish + a fragment to downstream consumers. */ +typedef void (fd_mux_before_credit_fn)( void * ctx, + fd_mux_context_t * mux); + +/* fd_mux_after_credit_fn is called every iteration of the mux run loop, + whether there is a new frag ready to receive or not, except in cases + where the mux is backpressured by a downstream consumer and would not + be able to publish. + + The callback might be used for publishing new fragments to downstream + consumers in the main loop which are not in response to an incoming + fragment. For example, code that collects incoming fragments over + a period of 1 second and joins them together before publishing a + large block fragment downstream, would publish the block here. + + The ctx is a user-provided context object from when the mux tile was + initialized. The mux is the mux which is invoking this callback. + The mux should only be used for calling fd_mux_publish to publish + a fragment to downstream consumers. */ +typedef void (fd_mux_after_credit_fn)( void * ctx, + fd_mux_context_t * mux ); + +/* fd_mux_before_frag_fn is called immediately whenever a new fragment + has been detected that was published by an upstream producer. 
The + signature and sequence number (sig and seq) provided as arguments + are read atomically from shared memory, so must both match each other + from the published fragment (aka. they will not be torn or partially + overwritten). in_idx is an index in [0, num_ins) indicating which + producer published the fragment. + + No fragment data has been read yet here, nor has other metadata, for + example the size or timestamps of the fragment. Mainly this callback + is useful for deciding whether to filter the fragment based on its + signature. If opt_filter is set to non-zero, the frag will be + skipped completely, no fragment data will be read, and the in will + be advanced so that we now wait for the next fragment. + + The ctx is a user-provided context object from when the mux tile was + initialized. */ +typedef void (fd_mux_before_frag_fn)( void * ctx, + ulong in_idx, + ulong sig, + ulong seq, + int * opt_filter ); + +/* fd_mux_during_frag_fn is called after the mux has received a new frag + from an in, but before the mux has checked that it was overrun. This + callback is not invoked if the mux is backpressured, as it would not + try and read a frag from an in in the first place (instead, leaving + it on the in mcache to backpressure the upstream producer). in_idx + will be the index of the in that the frag was received from. + + If the producer of the frags is respecting flow control, it is safe + to read frag data in any of the callbacks, but it is suggested to + copy or read frag data within this callback, as if the producer does + not respect flow control, the frag may be torn or corrupt due to an + overrun by the reader. If the frag being read from has been + overwritten while this callback is running, the frag will be ignored + and the mux will not call the process function. Instead it will + recover from the overrun and continue with new frags. + + This function cannot fail. 
If opt_filter is set to non-zero, it + means the frag should be filtered and not passed on to downstream + consumers of the mux. + + The ctx is a user-provided context object from when the mux tile was + initialized. + + sig, chunk, and sz are the respective fields from the mcache fragment + that was received. If the producer is not respecting flow control, + these may be corrupt or torn and should not be trusted. */ + +typedef void (fd_mux_during_frag_fn)( void * ctx, + ulong in_idx, + ulong sig, + ulong chunk, + ulong sz, + int * opt_filter ); + +/* fd_mux_after_frag_fn is called immediately after the + fd_mux_during_frag_fn, along with an additional check that the reader + was not overrun while handling the frag. If the reader was overrun, + the frag is abandoned and this function is not called. This callback + is not invoked if the mux is backpressured, as it would not read a + frag in the first place. It is also not invoked if + fd_mux_during_frag sets opt_filter to non-zero, indicating to filter + the frag. + + You should not read the frag data directly here, as it might still + get overrun, instead it should be copied out of the frag during the + read callback if needed later. + + This function cannot fail. If opt_filter is set to non-zero, it + means the frag should be filtered and not passed on to downstream + consumers of the mux. + + The ctx is a user-provided context object from when the mux tile was + initialized. + + opt_sig, opt_chunk, and opt_sz are the respective fields from the + mcache fragment that was received. The callback can modify these + values to change the sig, chunk, and sz of the outgoing frag that is + being copied to downstream consumers. If the producer is not + respecting flow control, these may be corrupt or torn and should not + be trusted. 
*/ + +typedef void (fd_mux_after_frag_fn)( void * ctx, + ulong * opt_sig, + ulong * opt_chunk, + ulong * opt_sz, + int * opt_filter ); + +/* By convention, the mux tile (and other tiles) use the app data region + of the joined cnc to store overall tile diagnostics like whether they + are backpressured. fd_mux_cnc_diag_write is called back to let the + user add additional diagnostics. This should not touch cnc_app[0] or + cnc_app[1] which are reserved for the mux tile (FD_CNC_DIAG_IN_BACKP + and FD_CNC_DIAG_BACKP_CNT). The user can use cnc_app[2] and beyond, + if such additional space was reserved when the cnc was created. + + fd_mux_cnc_diag_write and fd_mux_cnc_diag_clear are a pair of + functions to support accumulating counters. fd_mux_cnc_diag_write is + called inside a compiler fence to ensure the writes do not get + reordered, which may be important for observers or monitoring tools, + but such a guarantee is not needed when clearing local values in the + ctx. A typical usage for a counter is then to increment from a local + context counter in write(), and then reset the local context counter + to 0 in clear(). + + The ctx is a user-provided context object from when the mux tile was + initialized. */ + +typedef void (fd_mux_cnc_diag_write)( void * ctx, + ulong * cnc_app ); + +typedef void (fd_mux_cnc_diag_clear)( void * ctx ); + +/* fd_mux_callbacks_t will be invoked during mux tile execution, and can + be used to alter behavior of the mux tile from the default of copying + frags from the inputs directly to the outputs. Each of the callbacks + can be NULL, in which case it will not be executed. 
*/ + +typedef struct { + fd_mux_before_credit_fn * before_credit; + fd_mux_after_credit_fn * after_credit; + fd_mux_before_frag_fn * before_frag; + fd_mux_during_frag_fn * during_frag; + fd_mux_after_frag_fn * after_frag; + + fd_mux_cnc_diag_write * cnc_diag_write; + fd_mux_cnc_diag_clear * cnc_diag_clear; +} fd_mux_callbacks_t; + FD_PROTOTYPES_BEGIN /* fd_mux_tile multiplex fragment streams provided through in_cnt @@ -60,14 +283,15 @@ FD_PROTOTYPES_BEGIN implementation as a single slow reliable consumer can backpressure _all_ producers and _all_ other consumers using the mux.) - No frags will be filtered by the multiplexer currently. The order of - frags among a group of streams covered by a single in_mcache will be - preserved. Frags from different groups of streams can be arbitrarily - interleaved (but this makes an extreme best effort to avoid - starvation and minimize slip between different groups of streams). + The order of frags among a group of streams covered by a single + in_mcache will be preserved. Frags from different groups of streams + can be arbitrarily interleaved (but this makes an extreme best effort + to avoid starvation and minimize slip between different groups of + streams). The signature, chunk, sz, ctl and tsorig input fragment metadata will - be unchanged by this tile. + be unchanged by this tile, unless they are modified by the user in a + callback. For seq, the mux tile will resequence the frags from all the mcache's into a new total order consistent with the above. @@ -179,7 +403,7 @@ FD_PROTOTYPES_BEGIN up to monitoring scripts. It is recommend that inputs and outputs also use their cnc and fseq application regions similarly for monitoring simplicity / consistency. - + The lifetime of the cnc, mcaches, fseqs, rng and scratch used by this tile should be a superset of this tile's lifetime. 
While this tile is running, no other tile should use cnc for its command and control, @@ -204,17 +428,33 @@ fd_mux_tile_scratch_footprint( ulong in_cnt, ulong out_cnt ); int -fd_mux_tile( fd_cnc_t * cnc, /* Local join to the mux's command-and-control */ - ulong in_cnt, /* Number of input mcaches to multiplex, inputs are indexed [0,in_cnt) */ - fd_frag_meta_t const ** in_mcache, /* in_mcache[in_idx] is the local join to input in_idx's mcache */ - ulong ** in_fseq, /* in_fseq [in_idx] is the local join to input in_idx's fseq */ - fd_frag_meta_t * mcache, /* Local join to the mux's frag stream output mcache */ - ulong out_cnt, /* Number of reliable consumers, reliable consumers are indexed [0,out_cnt) */ - ulong ** out_fseq, /* out_fseq[out_idx] is the local join to reliable consumer out_idx's fseq */ - ulong cr_max, /* Maximum number of flow control credits, 0 means use a reasonable default */ - long lazy, /* Lazyiness, <=0 means use a reasonable default */ - fd_rng_t * rng, /* Local join to the rng this mux should use */ - void * scratch ); /* Tile scratch memory */ +fd_mux_tile( fd_cnc_t * cnc, /* Local join to the mux's command-and-control */ + ulong pid, /* Tile PID for diagnostic purposes */ + ulong flags, /* Any of FD_MUX_FLAG_* specifying how to run the mux */ + ulong in_cnt, /* Number of input mcaches to multiplex, inputs are indexed [0,in_cnt) */ + fd_frag_meta_t const ** in_mcache, /* in_mcache[in_idx] is the local join to input in_idx's mcache */ + ulong ** in_fseq, /* in_fseq [in_idx] is the local join to input in_idx's fseq */ + fd_frag_meta_t * mcache, /* Local join to the mux's frag stream output mcache */ + ulong out_cnt, /* Number of reliable consumers, reliable consumers are indexed [0,out_cnt) */ + ulong ** out_fseq, /* out_fseq[out_idx] is the local join to reliable consumer out_idx's fseq */ + ulong cr_max, /* Maximum number of flow control credits, 0 means use a reasonable default */ + long lazy, /* Laziness, <=0 means use a reasonable default 
*/ + fd_rng_t * rng, /* Local join to the rng this mux should use */ + void * scratch, /* Tile scratch memory */ + void * ctx, /* User supplied context to be passed to the read and process functions */ + fd_mux_callbacks_t * callbacks ); /* User supplied callbacks to be invoked during mux tile execution */ + +/* If the mux is operating with FD_MUX_FLAG_MANUAL_PUBLISH, the caller can optionally + publish fragments to the consumers themselves. To do this, they should call + fd_mux_publish with the mux context provided to the callbacks. */ +void +fd_mux_publish( fd_mux_context_t * ctx, + ulong sig, + ulong chunk, + ulong sz, + ulong ctl, + ulong tsorig, + ulong tspub ); FD_PROTOTYPES_END diff --git a/src/disco/mux/fd_mux_tile.c b/src/disco/mux/fd_mux_tile.c index c016c2760c..f269f1d3d2 100644 --- a/src/disco/mux/fd_mux_tile.c +++ b/src/disco/mux/fd_mux_tile.c @@ -81,7 +81,8 @@ main( int argc, FD_LOG_NOTICE(( "Run" )); - int err = fd_mux_tile( cnc, in_cnt, in_mcache, in_fseq, mcache, out_cnt, out_fseq, cr_max, lazy, rng, scratch ); + fd_mux_callbacks_t callbacks = {0}; + int err = fd_mux_tile( cnc, 0, FD_MUX_FLAG_DEFAULT, in_cnt, in_mcache, in_fseq, mcache, out_cnt, out_fseq, cr_max, lazy, rng, scratch, NULL, &callbacks ); if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_mux_tile failed (%i)", err )); FD_LOG_NOTICE(( "Fini" )); diff --git a/src/disco/mux/test_mux.c b/src/disco/mux/test_mux.c index 28f15d2c87..58e8d64334 100644 --- a/src/disco/mux/test_mux.c +++ b/src/disco/mux/test_mux.c @@ -275,8 +275,9 @@ mux_tile_main( int argc, fd_rng_t _rng[1]; fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, cfg->mux_seed, 0UL ) ); - int err = fd_mux_tile( cnc, cfg->tx_cnt, tx_mcache, tx_fseq, mux_mcache, cfg->rx_cnt, rx_fseq, - cfg->mux_cr_max, cfg->mux_lazy, rng, cfg->mux_scratch_mem ); + fd_mux_callbacks_t callbacks = {0}; + int err = fd_mux_tile( cnc, 0, FD_MUX_FLAG_DEFAULT, cfg->tx_cnt, tx_mcache, tx_fseq, mux_mcache, cfg->rx_cnt, rx_fseq, + cfg->mux_cr_max, cfg->mux_lazy, rng, cfg->mux_scratch_mem, 
NULL, &callbacks ); if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "fd_mux_tile failed (%i)", err )); fd_rng_delete( fd_rng_leave( rng ) ); diff --git a/src/disco/pack/Local.mk b/src/disco/pack/Local.mk new file mode 100644 index 0000000000..af18498ff2 --- /dev/null +++ b/src/disco/pack/Local.mk @@ -0,0 +1,3 @@ +$(call add-hdrs,fd_pack.h) +$(call add-objs,fd_pack,fd_disco) +# $(call make-unit-test,test_pack,test_pack,fd_disco fd_tango fd_util) diff --git a/src/disco/pack/fd_pack.c b/src/disco/pack/fd_pack.c new file mode 100644 index 0000000000..7678529f75 --- /dev/null +++ b/src/disco/pack/fd_pack.c @@ -0,0 +1,234 @@ +#include "fd_pack.h" + +#include "../mux/fd_mux.h" + +#define BLOCK_DURATION_NS (400UL*1000UL*1000UL) + +/* About 1.5 kB on the stack */ +#define FD_PACK_PACK_MAX_OUT (16UL) + +/* 1.5 M cost units, enough for 1 max size transaction */ +const ulong CUS_PER_MICROBLOCK = 1500000UL; + +const float VOTE_FRACTION = 0.75; + +typedef struct { + fd_pack_t * pack; + fd_txn_p_t * cur_slot; + + long block_duration_ticks; + long block_end; + + fd_pack_in_ctx_t * in; + + ulong out_cnt; + ulong ** out_busy; + + void * out_wksp; + ulong out_chunk0; + ulong out_wmark; + ulong out_chunk; +} fd_pack_ctx_t; + +static inline void +before_credit( void * _ctx, + fd_mux_context_t * mux ) { + (void)mux; + + fd_pack_ctx_t * ctx = (fd_pack_ctx_t *)_ctx; + + if( FD_UNLIKELY( ctx->cur_slot ) ) { + /* If we were overrun while processing a frag from an in, then cur_slot + is left dangling and not cleaned up, so clean it up here (by returning + the slot to the pool of free slots). */ + fd_pack_insert_txn_cancel( ctx->pack, ctx->cur_slot ); + ctx->cur_slot = NULL; + } + + /* Are we ready to end the block? 
*/ + + long now = fd_tickcount(); + if( FD_UNLIKELY( (now-ctx->block_end)>=0L ) ) { + fd_pack_end_block( ctx->pack ); + ctx->block_end += ctx->block_duration_ticks; + } +} + +static inline void +after_credit( void * _ctx, + fd_mux_context_t * mux ) { + fd_pack_ctx_t * ctx = (fd_pack_ctx_t *)_ctx; + + /* Is it time to schedule the next microblock? For each banking + thread, if it's not busy... */ + for( ulong i=0UL; iout_cnt; i++ ) { + if( FD_LIKELY( fd_fseq_query( ctx->out_busy[i] ) == *mux->seq ) ) { /* optimize for the case we send a microblock */ + FD_LOG_WARNING(( "out_busy[%lu] is %lu, expected %lu", i, fd_fseq_query( ctx->out_busy[i] ), *mux->seq )); + fd_pack_microblock_complete( ctx->pack, i ); + + void * microblock_dst = fd_chunk_to_laddr( ctx->out_wksp, ctx->out_chunk ); + ulong schedule_cnt = fd_pack_schedule_next_microblock( ctx->pack, CUS_PER_MICROBLOCK, VOTE_FRACTION, i, microblock_dst ); + if( FD_LIKELY( schedule_cnt ) ) { + ulong tspub = (ulong)fd_frag_meta_ts_comp( fd_tickcount() ); + ulong chunk = ctx->out_chunk; + ulong msg_sz = schedule_cnt*sizeof(fd_txn_p_t); + + /* publish with sig=i, banks will filter to only handle frags with their own sig idx */ + fd_mux_publish( mux, i, chunk, msg_sz, 0, 0UL, tspub ); + + ctx->out_chunk = fd_dcache_compact_next( ctx->out_chunk, msg_sz, ctx->out_chunk0, ctx->out_wmark ); + } + } else { + FD_LOG_WARNING(( "out_busy[%lu] is %lu, expected %lu", i, fd_fseq_query( ctx->out_busy[i] ), *mux->seq )); + } + } +} + +/* At this point, we have started receiving frag seq with details in + mline at time now. Speculatively process it here. 
*/ + +static inline void +during_frag( void * _ctx, + ulong in_idx, + ulong sig, + ulong chunk, + ulong sz, + int * opt_filter ) { + (void)sig; + (void)opt_filter; + + fd_pack_ctx_t * ctx = (fd_pack_ctx_t *)_ctx; + + if( FD_UNLIKELY( chunkin[ in_idx ].chunk0 || chunk>=ctx->in[ in_idx ].wmark || sz > FD_TPU_DCACHE_MTU ) ) + FD_LOG_ERR(( "chunk %lu %lu corrupt, not in range [%lu,%lu)", chunk, sz, ctx->in[ in_idx ].chunk0, ctx->in[ in_idx ].wmark )); + + ctx->cur_slot = fd_pack_insert_txn_init( ctx->pack ); + + uchar const * dcache_entry = fd_chunk_to_laddr_const( ctx->in[in_idx].wksp, chunk ); + /* Assume that the dcache entry is: + Payload ....... (payload_sz bytes) + 0 or 1 byte of padding (since alignof(fd_txn) is 2) + fd_txn ....... (size computed by fd_txn_footprint) + payload_sz (2B) + mline->sz includes all three fields and the padding */ + ulong payload_sz = *(ushort*)(dcache_entry + sz - sizeof(ushort)); + uchar const * payload = dcache_entry; + fd_txn_t const * txn = (fd_txn_t const *)( dcache_entry + fd_ulong_align_up( payload_sz, 2UL ) ); + fd_memcpy( ctx->cur_slot->payload, payload, payload_sz ); + fd_memcpy( TXN(ctx->cur_slot), txn, fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) ); + ctx->cur_slot->payload_sz = payload_sz; + ctx->cur_slot->meta = sig; + +#if DETAILED_LOGGING + FD_LOG_NOTICE(( "Pack got a packet. Payload size: %lu, txn footprint: %lu", payload_sz, + fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) + )); +#endif +} + +/* After the transaction has been fully received, and we know we were + not overrun while reading it, check if it's a duplicate of a prior + transaction. 
*/ + +static inline void +after_frag( void * _ctx, + ulong * opt_sig, + ulong * opt_chunk, + ulong * opt_sz, + int * opt_filter ) { + (void)opt_sig; + (void)opt_chunk; + (void)opt_sz; + (void)opt_filter; + + fd_pack_ctx_t * ctx = (fd_pack_ctx_t *)_ctx; + + fd_pack_insert_txn_fini( ctx->pack, ctx->cur_slot ); + ctx->cur_slot = NULL; +} + +int +fd_pack_tile( fd_cnc_t * cnc, + ulong pid, + ulong in_cnt, + fd_frag_meta_t const ** in_mcache, + ulong ** in_fseq, + uchar const ** in_dcache, + fd_pack_t * pack, + fd_frag_meta_t * mcache, + uchar * dcache, + ulong out_cnt, + ulong ** out_fseq, + ulong ** out_busy, + ulong cr_max, + long lazy, + fd_rng_t * rng, + void * scratch ) { + fd_pack_ctx_t ctx[1]; + + fd_mux_callbacks_t callbacks[1] = { 0 }; + callbacks->before_credit = before_credit; + callbacks->after_credit = after_credit; + callbacks->during_frag = during_frag; + callbacks->after_frag = after_frag; + + ulong scratch_top = (ulong)scratch; + + do { + if( FD_UNLIKELY( !dcache ) ) { FD_LOG_WARNING(( "NULL dcache" )); return 1; } + if( FD_UNLIKELY( !pack ) ) { FD_LOG_WARNING(( "NULL pack" )); return 1; } + if( FD_UNLIKELY( out_cnt>FD_PACK_PACK_MAX_OUT ) ) { FD_LOG_WARNING(( "pack tile connects to too many banking tiles" )); return 1; } + for( ulong i=0; ipack = pack; + ctx->cur_slot = NULL; + + ctx->block_duration_ticks = (long)(fd_tempo_tick_per_ns( NULL ) * (double)BLOCK_DURATION_NS); + ctx->block_end = fd_tickcount() + ctx->block_duration_ticks; + + ctx->in = (fd_pack_in_ctx_t*)SCRATCH_ALLOC( alignof(fd_pack_in_ctx_t), in_cnt*sizeof(fd_pack_in_ctx_t) ); + for( ulong i=0; iin[i].wksp = fd_wksp_containing( in_dcache[i] ); + ctx->in[i].chunk0 = fd_dcache_compact_chunk0( ctx->in[i].wksp, in_dcache[i] ); + ctx->in[i].wmark = fd_dcache_compact_wmark ( ctx->in[i].wksp, in_dcache[i], FD_TPU_DCACHE_MTU ); + } + + ctx->out_cnt = out_cnt; + ctx->out_busy = out_busy; + + if( FD_UNLIKELY( !fd_dcache_compact_is_safe( fd_wksp_containing( dcache ), dcache, MAX_MICROBLOCK_SZ, 
fd_mcache_depth( mcache ) ) ) ) { + FD_LOG_WARNING(( "dcache not compatible with wksp base and mcache depth" )); + return 1; + } + ctx->out_wksp = fd_wksp_containing( dcache ); + ctx->out_chunk0 = fd_dcache_compact_chunk0( ctx->out_wksp, dcache ); + ctx->out_wmark = fd_dcache_compact_wmark ( ctx->out_wksp, dcache, MAX_MICROBLOCK_SZ ); + ctx->out_chunk = ctx->out_chunk0; + } while(0); + + return fd_mux_tile( cnc, + pid, + FD_MUX_FLAG_MANUAL_PUBLISH | FD_MUX_FLAG_COPY, + in_cnt, + in_mcache, + in_fseq, + mcache, + out_cnt, + out_fseq, + cr_max, + lazy, + rng, + (void*)fd_ulong_align_up( scratch_top, FD_MUX_TILE_SCRATCH_ALIGN ), + ctx, + callbacks ); +} diff --git a/src/disco/pack/fd_pack.h b/src/disco/pack/fd_pack.h new file mode 100644 index 0000000000..4cec633623 --- /dev/null +++ b/src/disco/pack/fd_pack.h @@ -0,0 +1,55 @@ +#ifndef HEADER_fd_src_disco_pack_fd_pack_h +#define HEADER_fd_src_disco_pack_fd_pack_h + +/* fd_pack provides a tile that collects transactions from multiple + streams of input fragments, inserts them into a pack object, and + publishes scheduled microblocks to its downstream consumers (the + banking tiles). + + The pack tile is a wrapper around the mux tile, run with manual + publishing, that uses the mux callbacks to insert incoming + transactions and to schedule and publish microblocks. */ + +#include "../fd_disco_base.h" + +#include "../../ballet/pack/fd_pack.h" + +/* in bytes. 
Defined this way to use the size field of mcache */ +#define MAX_MICROBLOCK_SZ USHORT_MAX + +typedef struct { + void * wksp; + ulong chunk0; + ulong wmark; +} fd_pack_in_ctx_t; + +#define FD_PACK_TILE_SCRATCH_ALIGN (128UL) +#define FD_PACK_TILE_SCRATCH_FOOTPRINT( in_cnt, out_cnt ) \ + FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \ + alignof(fd_pack_in_ctx_t), (in_cnt)*sizeof(fd_pack_in_ctx_t) ), \ + FD_MUX_TILE_SCRATCH_ALIGN, FD_MUX_TILE_SCRATCH_FOOTPRINT( in_cnt, out_cnt ) ), \ + FD_PACK_TILE_SCRATCH_ALIGN ) + +FD_PROTOTYPES_BEGIN + +int +fd_pack_tile( fd_cnc_t * cnc, /* Local join to the pack tile's command-and-control */ + ulong pid, /* Tile PID for diagnostic purposes */ + ulong in_cnt, /* Number of input mcaches to multiplex, inputs are indexed [0,in_cnt) */ + fd_frag_meta_t const ** in_mcache, /* in_mcache[in_idx] is the local join to input in_idx's mcache */ + ulong ** in_fseq, /* in_fseq [in_idx] is the local join to input in_idx's fseq */ + uchar const ** in_dcache, /* in_dcache[in_idx] is the local join to input in_idx's dcache */ + fd_pack_t * pack, /* Local join to the pack tile's pack object */ + fd_frag_meta_t * mcache, /* Local join to the pack tile's frag stream output mcache */ + uchar * dcache, /* Local join to the pack tile's frag stream output dcache */ + ulong out_cnt, /* Number of reliable consumers, reliable consumers are indexed [0,out_cnt) */ + ulong ** out_fseq, /* out_fseq[out_idx] is the local join to reliable consumer out_idx's fseq */ + ulong ** out_busy, /* out_busy[out_idx] is the local join to reliable consumer out_idx's busy fseq */ + ulong cr_max, /* Maximum number of flow control credits, 0 means use a reasonable default */ + long lazy, /* Laziness, <=0 means use a reasonable default */ + fd_rng_t * rng, /* Local join to the rng this pack tile should use */ + void * scratch ); /* Tile scratch memory */ + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_disco_pack_fd_pack_h */ diff --git a/src/disco/quic/Local.mk 
b/src/disco/quic/Local.mk index 5d4eb78544..ab45b805f7 100644 --- a/src/disco/quic/Local.mk +++ b/src/disco/quic/Local.mk @@ -1,5 +1,5 @@ ifdef FD_HAS_OPENSSL $(call add-hdrs,fd_quic.h) -$(call add-objs,fd_quic_tile,fd_disco) +$(call add-objs,fd_quic,fd_disco) $(call make-unit-test,test_quic_tile,test_quic_tile,fd_disco fd_tango fd_ballet fd_quic fd_util) endif diff --git a/src/disco/quic/fd_quic.c b/src/disco/quic/fd_quic.c new file mode 100644 index 0000000000..27460dd96d --- /dev/null +++ b/src/disco/quic/fd_quic.c @@ -0,0 +1,601 @@ +#include "fd_quic.h" + +#include "../mux/fd_mux.h" + +/* fd_quic_msg_ctx_t is the message context of a txn being received by + the QUIC tile over the TPU protocol. It is used to detect dcache + overruns by identifying which QUIC stream is currently bound to a + dcache chunk. An array of fd_quic_msg_ctx_t to fit depth entries + forms the dcache's app region. + + This is necessary for stream defrag, during which multiple QUIC + streams produce into multiple dcache chunks concurrently. In the + worst case, a defrag is started for every available chunk in the + dcache. When the producer wraps around to the first dcache entry, it + will override the existing defrag process. This overrun is then + safely detected through a change in conn/stream IDs when this + previous defrag process continues. */ + +typedef struct __attribute__((aligned(32UL))) { + ulong conn_id; + ulong stream_id; /* ULONG_MAX marks completed msg */ + uchar * data; /* Points to first byte of dcache entry */ + uint sz; + uint tsorig; +} fd_quic_msg_ctx_t; + +/* When QUIC is being serviced and a transaction is completely received + from the network peer, the completed message will have been written + to the outgoing dcache. The QUIC completion callback will then + append a pointer to this message into a simple queue so that the core + tile code can later publish it to the outgoing mcache. 
*/ +#define QUEUE_NAME pubq +#define QUEUE_T fd_quic_msg_ctx_t * +#include "../../util/tmpl/fd_queue_dynamic.c" + +typedef struct { + fd_quic_msg_ctx_t ** pubq; + + fd_mux_context_t * mux; + + fd_quic_t * quic; + const fd_aio_t * quic_rx_aio; + + ushort legacy_transaction_port; /* port for receiving non-QUIC (raw UDP) transactions on*/ + + ulong xsk_aio_cnt; + fd_xsk_aio_t ** xsk_aio; + + ulong inflight_streams; /* number of QUIC network streams currently open, used for flow control */ + ulong conn_cnt; /* count of live connections, put into the cnc for diagnostics */ + ulong conn_seq; /* current quic connection sequence number, put into cnc for diagnostics */ + + void * out_wksp; + uchar * out_dcache_app; + ulong out_chunk0; + ulong out_wmark; + ulong out_chunk; +} fd_quic_ctx_t; + +/* fd_quic_dcache_app_footprint returns the required footprint in bytes + for the QUIC tile's out dcache app region of the given depth. */ + +FD_FN_CONST ulong +fd_quic_dcache_app_footprint( ulong depth ) { + return depth * sizeof(fd_quic_msg_ctx_t); +} + +FD_FN_CONST ulong +fd_quic_tile_scratch_align( void ) { + return FD_QUIC_TILE_SCRATCH_ALIGN; +} + +FD_FN_CONST ulong +fd_quic_tile_scratch_footprint( ulong depth, + ulong in_cnt, + ulong out_cnt ) { + if( FD_UNLIKELY( in_cnt >FD_MUX_TILE_IN_MAX ) ) return 0UL; + if( FD_UNLIKELY( out_cnt>FD_MUX_TILE_OUT_MAX ) ) return 0UL; + ulong scratch_top = 0UL; + + SCRATCH_ALLOC( fd_aio_align(), fd_aio_footprint() ); + SCRATCH_ALLOC( pubq_align(), pubq_footprint( depth ) ); + SCRATCH_ALLOC( fd_mux_tile_scratch_align(), fd_mux_tile_scratch_footprint( in_cnt, out_cnt ) ); + return fd_ulong_align_up( scratch_top, fd_quic_tile_scratch_align() ); +} + +/* This tile always publishes messages downstream, even if there are + no credits available. It ignores the flow control of the downstream + verify tile. This is OK as the verify tile is written to expect + this behavior, and enables the QUIC tile to publish as fast as it + can. 
It would currently be difficult trying to backpressure further + up the stack to the network itself. */ +static inline void +before_credit( void * _ctx, + fd_mux_context_t * mux ) { + fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx; + + ctx->mux = mux; + + /* Poll network backend */ + for( ulong i=0; ixsk_aio_cnt; i++ ) fd_xsk_aio_service( ctx->xsk_aio[i] ); + + /* Service QUIC clients */ + fd_quic_service( ctx->quic ); + + /* Publish completed messages */ + ulong pub_cnt = pubq_cnt( ctx->pubq ); + for( ulong i=0; ipubq[ i ]; + + if( FD_UNLIKELY( msg->stream_id != ULONG_MAX ) ) + continue; /* overrun */ + + /* Get byte slice backing serialized txn data */ + + uchar * txn = msg->data; + ulong txn_sz = msg->sz; + + FD_TEST( txn_sz<=FD_TPU_MTU ); + + /* At this point dcache only contains raw payload of txn. + Beyond end of txn, but within bounds of msg layout, add a trailer + describing the txn layout. + + [ payload ] (txn_sz bytes) + [ pad-align 2B ] (? bytes) + [ fd_txn_t ] (? bytes) + [ payload_sz ] (2B) */ + + /* Ensure sufficient space to store trailer */ + + void * txn_t = (void *)( fd_ulong_align_up( (ulong)msg->data + txn_sz, 2UL ) ); + if( FD_UNLIKELY( (FD_TPU_DCACHE_MTU - ((ulong)txn_t - (ulong)msg->data)) < (FD_TXN_MAX_SZ+2UL) ) ) { + FD_LOG_WARNING(( "dcache entry too small" )); + continue; + } + + /* Parse transaction */ + + ulong txn_t_sz = fd_txn_parse( txn, txn_sz, txn_t, NULL ); + if( FD_UNLIKELY( !txn_t_sz ) ) { + FD_LOG_DEBUG(( "fd_txn_parse(sz=%lu) failed", txn_sz )); + continue; /* invalid txn (terminate conn?) 
*/ + } + + /* Write payload_sz */ + + ushort * payload_sz = (ushort *)( (ulong)txn_t + txn_t_sz ); + *payload_sz = (ushort)txn_sz; + + /* End of message */ + + void * msg_end = (void *)( (ulong)payload_sz + 2UL ); + + /* Create mcache entry */ + + ulong chunk = fd_laddr_to_chunk( ctx->out_wksp, msg->data ); + ulong sz = (ulong)msg_end - (ulong)msg->data; + ulong sig = 0; /* A non-dummy entry representing a finished transaction */ + ulong ctl = fd_frag_meta_ctl( 0, 1 /* som */, 1 /* eom */, 0 /* err */ ); + ulong tsorig = msg->tsorig; + ulong tspub = fd_frag_meta_ts_comp( fd_tickcount() ); + + FD_TEST( sz<=FD_TPU_DCACHE_MTU ); + fd_mux_publish( mux, sig, chunk, sz, ctl, tsorig, tspub ); + } + pubq_remove_all( ctx->pubq ); + ctx->inflight_streams -= pub_cnt; +} + +static inline void +cnc_diag_write( void * _ctx, ulong * cnc_diag ) { + fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx; + + cnc_diag[ FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT ] = ctx->conn_cnt; + cnc_diag[ FD_QUIC_CNC_DIAG_TPU_CONN_SEQ ] = ctx->conn_seq; +} + +FD_FN_CONST static inline ulong +fd_quic_chunk_idx( ulong chunk0, + ulong chunk ) { + return ((chunk-chunk0)*FD_CHUNK_FOOTPRINT) / fd_ulong_align_up( FD_TPU_DCACHE_MTU, FD_CHUNK_FOOTPRINT ); +} + +/* fd_quic_dcache_msg_ctx returns a pointer to the TPU/QUIC message + context struct for the given dcache app laddr and chunk. app_laddr + points to the first byte of the dcache's app region in the tile's + local address space and has FD_DCACHE_ALIGN alignment (see + fd_dcache_app_laddr()). chunk must be within the valid bounds for + this dcache. */ + +FD_FN_CONST static inline fd_quic_msg_ctx_t * +fd_quic_dcache_msg_ctx( uchar * app_laddr, + ulong chunk0, + ulong chunk ) { + fd_quic_msg_ctx_t * msg_arr = (fd_quic_msg_ctx_t *)app_laddr; + return &msg_arr[ fd_quic_chunk_idx( chunk0, chunk ) ]; +} + +/* quic_now is called by the QUIC engine to get the current timestamp in + UNIX time. 
*/ + +static ulong +quic_now( void * ctx ) { + (void)ctx; + return (ulong)fd_log_wallclock(); +} + +/* Tile-local sequence number for conns */ +static FD_TLS ulong conn_seq = 0UL; + +/* quic_conn_new is invoked by the QUIC engine whenever a new connection + is being established. */ +static void +quic_conn_new( fd_quic_conn_t * conn, + void * _ctx ) { + + conn->local_conn_id = ++conn_seq; + + fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx; + ctx->conn_seq = conn_seq; + ctx->conn_cnt++; +} + +/* quic_conn_final is called back by the QUIC engine whenever a + connection is closed. This could be because it ended gracefully, or + was terminated, or any other reason. */ +static void +quic_conn_final( fd_quic_conn_t * conn, + void * _ctx ) { + (void)conn; + + fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx; + ctx->conn_cnt--; +} + +/* By default the dcache only has headroom for one in-flight fragment, + but QUIC might have many. If we exceed the headroom, we publish a + dummy mcache entry to evict the reader from this fragment we want to + use so we can start using it. + + This is not ideal because if the reader is already done with the + fragment we are writing a useless mcache entry, so we try and do it + only when needed. + + The QUIC receive path might typically execute stream_create, + stream_receive, and stream_notice serially, so it is often the case + that even if we are handling multiple new connections in one receive + batch, the in-flight count remains zero or one. 
*/ + +static inline void +fd_tpu_dummy_dcache( fd_quic_ctx_t * ctx ) { + if( FD_LIKELY( ctx->inflight_streams > 0 ) ) { + ulong ctl = fd_frag_meta_ctl( 0, 1 /* som */, 1 /* eom */, 0 /* err */ ); + ulong tsnow = fd_frag_meta_ts_comp( fd_tickcount() ); + fd_mux_publish( ctx->mux, 1, 0, 0, ctl, tsnow, tsnow ); + } +} + +/* quic_stream_new is called back by the QUIC engine whenever an open + connection creates a new stream, at the time this is called, both the + client and server must have agreed to open the stream. In case the + client has opened this stream, it is assumed that the QUIC + implementation has verified that the client has the necessary stream + quota to do so. */ + +static void +quic_stream_new( fd_quic_stream_t * stream, + void * _ctx, + int type ) { + + (void)type; /* TODO reject bidi streams? */ + + /* Load QUIC state */ + + fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx; + + ulong conn_id = stream->conn->local_conn_id; + ulong stream_id = stream->stream_id; + + /* Allocate new dcache entry */ + + ulong chunk = fd_dcache_compact_next( ctx->out_chunk, FD_TPU_DCACHE_MTU, ctx->out_chunk0, ctx->out_wmark ); + + fd_quic_msg_ctx_t * msg_ctx = fd_quic_dcache_msg_ctx( ctx->out_dcache_app, ctx->out_chunk0, chunk ); + msg_ctx->conn_id = conn_id; + msg_ctx->stream_id = stream_id; + msg_ctx->data = fd_chunk_to_laddr( ctx->out_wksp, chunk ); + msg_ctx->sz = 0U; + msg_ctx->tsorig = (uint)fd_frag_meta_ts_comp( fd_tickcount() ); + + fd_tpu_dummy_dcache( ctx ); + + ctx->inflight_streams += 1; + + /* Wind up for next callback */ + + ctx->out_chunk = chunk; /* Update dcache chunk index */ + stream->context = msg_ctx; /* Update stream dcache entry */ +} + +/* quic_stream_receive is called back by the QUIC engine when any stream + in any connection being serviced receives new data. Currently we + simply copy received data out of the xsk (network device memory) into + a local dcache. 
*/ + +static void +quic_stream_receive( fd_quic_stream_t * stream, + void * stream_ctx, + uchar const * data, + ulong data_sz, + ulong offset, + int fin ) { + + (void)fin; /* TODO instantly publish if offset==0UL && fin */ + + /* Bounds check */ + + /* First check that we won't overflow computing total_sz */ + if( FD_UNLIKELY( offset>UINT_MAX || data_sz>UINT_MAX ) ) { + //fd_quic_stream_close( stream, 0x03 ); /* FIXME fd_quic_stream_close not implemented */ + return; /* oversz stream */ + } + + ulong total_sz = offset+data_sz; + if( FD_UNLIKELY( total_sz>FD_TPU_MTU || total_szconn->local_conn_id; + ulong stream_id = stream->stream_id; + + /* Load existing dcache chunk ctx */ + + fd_quic_msg_ctx_t * msg_ctx = (fd_quic_msg_ctx_t *)stream_ctx; + if( FD_UNLIKELY( msg_ctx->conn_id != conn_id || msg_ctx->stream_id != stream_id ) ) { + //fd_quic_stream_close( stream, 0x03 ); /* FIXME fd_quic_stream_close not implemented */ + FD_LOG_WARNING(( "dcache overflow while demuxing %lu!=%lu %lu!=%lu", conn_id, msg_ctx->conn_id, stream_id, msg_ctx->stream_id )); + return; /* overrun */ + } + + /* Append data into chunk, we know this is valid */ + + FD_TEST( offset+data_sz <= FD_TPU_MTU ); /* paranoia */ + fd_memcpy( msg_ctx->data + offset, data, data_sz ); + FD_TEST( total_sz <= UINT_MAX ); /* paranoia, total_sz<=FD_TPU_MTU above*/ + msg_ctx->sz = (uint)total_sz; +} + +/* quic_stream_notify is called back by the QUIC implementation when a + stream is finished. This could either be because it completed + successfully after reading valid data, or it was closed prematurely + for some other reason. All streams must eventually notify. + + If we see a successful QUIC stream notify, it means we have received + a full transaction and should publish it downstream to be verified + and executed. 
*/ + +static void +quic_stream_notify( fd_quic_stream_t * stream, + void * stream_ctx, + int type ) { + /* Load QUIC state */ + + fd_quic_msg_ctx_t * msg_ctx = (fd_quic_msg_ctx_t *)stream_ctx; + fd_quic_conn_t * conn = stream->conn; + fd_quic_t * quic = conn->quic; + fd_quic_ctx_t * ctx = quic->cb.quic_ctx; /* TODO ugly */ + + if( FD_UNLIKELY( type!=FD_QUIC_NOTIFY_END ) ) { + ctx->inflight_streams -= 1; + return; /* not a successful stream close */ + } + + ulong conn_id = stream->conn->local_conn_id; + ulong stream_id = stream->stream_id; + if( FD_UNLIKELY( msg_ctx->conn_id != conn_id || msg_ctx->stream_id != stream_id ) ) { + ctx->inflight_streams -= 1; + return; /* overrun */ + } + + /* Mark message as completed */ + + msg_ctx->stream_id = ULONG_MAX; + + /* Add to local publish queue */ + + if( FD_UNLIKELY( pubq_full( ctx->pubq ) ) ) { + FD_LOG_WARNING(( "pubq full, dropping" )); + ctx->inflight_streams -= 1; + return; + } + pubq_push( ctx->pubq, msg_ctx ); +} + +/* legacy_stream_notify is called when a non-QUIC transaction is + received, that is, a regular unencrypted UDP packet transaction. For + now both QUIC and non-QUIC transactions are accepted, with traffic + type determined by port. + + UDP transactions must fit in one packet and cannot be fragmented, and + notify here means the entire packet was received. 
+ */
+
+static void
+legacy_stream_notify( void *        _ctx,
+                      uchar const * packet,
+                      uint          packet_sz ) {
+  fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx;
+
+  /* A packet larger than the TPU MTU cannot hold a valid txn */
+
+  if( FD_UNLIKELY( packet_sz > FD_TPU_MTU ) ) return;
+
+  /* Make sure the packet can be queued before touching any dcache
+     state.  Bailing out after the steps below would leave the msg ctx
+     of the next chunk overwritten, possibly a dummy frag published,
+     and inflight_streams incremented without ever being decremented
+     again (before_credit only subtracts the number of messages that
+     made it into pubq). */
+
+  if( FD_UNLIKELY( pubq_full( ctx->pubq ) ) ) {
+    FD_LOG_WARNING(( "pubq full, dropping" ));
+    return;
+  }
+
+  /* Allocate new dcache entry */
+
+  ulong chunk = fd_dcache_compact_next( ctx->out_chunk, FD_TPU_DCACHE_MTU, ctx->out_chunk0, ctx->out_wmark );
+
+  fd_quic_msg_ctx_t * msg_ctx = fd_quic_dcache_msg_ctx( ctx->out_dcache_app, ctx->out_chunk0, chunk );
+  msg_ctx->conn_id   = ULONG_MAX;
+  msg_ctx->stream_id = ULONG_MAX; /* ULONG_MAX marks the message as completed, see before_credit */
+  msg_ctx->data      = fd_chunk_to_laddr( ctx->out_wksp, chunk );
+  msg_ctx->sz        = packet_sz;
+  msg_ctx->tsorig    = (uint)fd_frag_meta_ts_comp( fd_tickcount() );
+
+  /* Evict any reader still on this chunk before writing to it */
+
+  fd_tpu_dummy_dcache( ctx );
+
+  ctx->inflight_streams += 1;
+
+  /* The whole txn arrived in this one packet, so copy it out and queue
+     it for publishing right away */
+
+  FD_TEST( packet_sz <= FD_TPU_MTU ); /* paranoia */
+  fd_memcpy( msg_ctx->data, packet, packet_sz );
+  pubq_push( ctx->pubq, msg_ctx );
+
+  ctx->out_chunk = chunk; /* Update dcache chunk index */
+}
+
+/* net_rx_aio_send is a callback invoked by aio when new data is
+   received on an incoming xsk.  The xsk might be bound to any interface
+   or ports, so the purpose of this callback is to determine if the
+   packet might be a valid transaction, and whether it is QUIC or
+   non-QUIC (raw UDP) before forwarding to the appropriate handler.
+
+   This callback is supposed to return the number of packets in the
+   batch which were successfully processed, but we always return
+   batch_cnt since there is no logic in place to backpressure this far
+   up the stack there is no sane way to "not handle" an incoming packet.
+ */ + +static int +net_rx_aio_send( void * _ctx, + fd_aio_pkt_info_t const * batch, + ulong batch_cnt, + ulong * opt_batch_idx, + int flush ) { + fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx; + + for( ulong i=0; i packet_end ) ) continue; + + /* Extract IP dest addr and UDP dest port */ + ulong ip_dstaddr = *(uint *)( iphdr+16UL ); + (void) ip_dstaddr; + ushort udp_dstport = *(ushort *)( udp+2UL ); + + uchar const * data = udp + 8U; + uint data_sz = (uint)(packet_end - data); + + if( FD_LIKELY( fd_ushort_bswap( udp_dstport ) == ctx->quic->config.net.listen_udp_port ) ) + fd_aio_send( ctx->quic_rx_aio, batch + i, 1, NULL, flush ); + else if( FD_LIKELY( fd_ushort_bswap( udp_dstport ) == ctx->legacy_transaction_port ) ) + legacy_stream_notify( ctx, data, data_sz ); + else + FD_LOG_ERR(( "Firedancer received a UDP packet on port %hu which was not expected. " + "Only ports %hu and %hu should be configured to forward packets. Do " + "you need to reload the XDP program?", + fd_ushort_bswap( udp_dstport ), ctx->quic->config.net.listen_udp_port, ctx->legacy_transaction_port )); + } + + /* the assumption here at present is that any packet that could not be + processed is simply dropped hence, all packets were consumed */ + if( FD_LIKELY( opt_batch_idx ) ) { + *opt_batch_idx = batch_cnt; + } + + return FD_AIO_SUCCESS; +} + +int +fd_quic_tile( fd_cnc_t * cnc, + ulong pid, + fd_quic_t * quic, + ushort legacy_transaction_port, + ulong xsk_aio_cnt, + fd_xsk_aio_t ** xsk_aio, + fd_frag_meta_t * mcache, + uchar * dcache, + ulong cr_max, + long lazy, + fd_rng_t * rng, + void * scratch ) { + fd_quic_ctx_t ctx[1]; + + fd_mux_callbacks_t callbacks[1] = { 0 }; + callbacks->before_credit = before_credit; + callbacks->cnc_diag_write = cnc_diag_write; + + ulong scratch_top = (ulong)scratch; + + do { + if( FD_UNLIKELY( !quic ) ) { FD_LOG_WARNING(( "NULL quic" )); return 1; } + if( FD_UNLIKELY( !dcache ) ) { FD_LOG_WARNING(( "NULL dcache" )); return 1; } + + quic->cb.conn_new = 
quic_conn_new; + quic->cb.conn_hs_complete = NULL; + quic->cb.conn_final = quic_conn_final; + quic->cb.stream_new = quic_stream_new; + quic->cb.stream_receive = quic_stream_receive; + quic->cb.stream_notify = quic_stream_notify; + quic->cb.now = quic_now; + quic->cb.now_ctx = NULL; + quic->cb.quic_ctx = ctx; + + if( FD_UNLIKELY( !xsk_aio_cnt ) ) { FD_LOG_WARNING(( "no xsk_aio" )); return 1; } + fd_quic_set_aio_net_tx( quic, fd_xsk_aio_get_tx( xsk_aio[0] ) ); + + if( FD_UNLIKELY( !fd_quic_init( quic ) ) ) { FD_LOG_WARNING(( "fd_quic_init failed" )); return 1; } + fd_aio_t * net_rx_aio = fd_aio_join( fd_aio_new( SCRATCH_ALLOC( fd_aio_align(), fd_aio_footprint() ), ctx, net_rx_aio_send ) ); + + ulong depth = fd_mcache_depth( mcache ); + if( FD_UNLIKELY( !fd_dcache_compact_is_safe( fd_wksp_containing( dcache ), dcache, FD_TPU_DCACHE_MTU, depth ) ) ) { + FD_LOG_WARNING(( "dcache not compatible with wksp base and mcache depth" )); + return 1; + } + + if( FD_UNLIKELY( fd_dcache_app_sz( dcache ) < fd_quic_dcache_app_footprint( depth ) ) ) { + FD_LOG_WARNING(( "dcache app sz too small (min=%lu have=%lu)", + fd_quic_dcache_app_footprint( depth ), + fd_dcache_app_sz( dcache ) )); + return 1; + } + + ctx->out_wksp = fd_wksp_containing( dcache ); + ctx->out_dcache_app = fd_dcache_app_laddr( dcache ); + ctx->out_chunk0 = fd_dcache_compact_chunk0( ctx->out_wksp, dcache ); + ctx->out_wmark = fd_dcache_compact_wmark ( ctx->out_wksp, dcache, FD_TPU_DCACHE_MTU ); + ctx->out_chunk = ctx->out_chunk0; + + ctx->inflight_streams = 0UL; + ctx->conn_cnt = 0UL; + ctx->conn_seq = 0UL; + + ctx->quic = quic; + + ctx->legacy_transaction_port = legacy_transaction_port; + + ctx->xsk_aio_cnt = xsk_aio_cnt; + ctx->xsk_aio = xsk_aio; + ctx->quic_rx_aio = fd_quic_get_aio_net_rx( quic ); + for( ulong i=0; ipubq = pubq_join( pubq_new( SCRATCH_ALLOC( pubq_align(), pubq_footprint( depth ) ), depth ) ); + } while(0); + + return fd_mux_tile( cnc, + pid, + FD_MUX_FLAG_MANUAL_PUBLISH | FD_MUX_FLAG_COPY, + 0, 
+ NULL, + NULL, + mcache, + 0, /* no reliable consumers, verify tiles may be overrun */ + NULL, + cr_max, + lazy, + rng, + (void*)fd_ulong_align_up( scratch_top, FD_MUX_TILE_SCRATCH_ALIGN ), + ctx, + callbacks ); +} diff --git a/src/disco/quic/fd_quic.h b/src/disco/quic/fd_quic.h index 84d5f15629..053a4526c4 100644 --- a/src/disco/quic/fd_quic.h +++ b/src/disco/quic/fd_quic.h @@ -1,9 +1,9 @@ #ifndef HEADER_fd_src_disco_quic_fd_quic_h #define HEADER_fd_src_disco_quic_fd_quic_h -/* fd_quic provides a QUIC server tile. +#include "../fd_disco_base.h" - ### TPU/QUIC +/* fd_quic provides a QUIC server tile. At present, TPU is the only protocol deployed on QUIC. It allows clients to send transactions to block producers (this tile). For @@ -14,123 +14,75 @@ packets. For more information, see the specification: https://github.com/solana-foundation/specs/blob/main/p2p/tpu.md - ### Tango semantics - The fd_quic tile acts as a plain old Tango producer writing to a cnc, - an mcache, and a dcache. The tile will defragment multi-packet - TPU streams coming in from QUIC, such that each mcache/dcache pair - forms a complete txn. This requires the dcache mtu to be at least - that of the largest allowed serialized txn size. + an mcache, and a dcache. The tile will defragment multi-packet TPU + streams coming in from QUIC, such that each mcache/dcache pair forms + a complete txn. This requires the dcache mtu to be at least that of + the largest allowed serialized txn size. To facilitate defragmentation, the fd_quic tile stores non-standard stream information in the dcache's application region. (An array of fd_quic_tpu_msg_ctx_t) - ### Networking - - Each QUIC tile serves a single network device RX queue. Serving - multiple network interfaces or multiple queues (receive side scaling) - requires multiple QUIC tiles. Multi-queue deployments require the use - of flow steering to ensure that each QUIC connection only reaches one - QUIC tile at a time. 
Flow steering based on UDP/IP source hashing as - frequently implemented by hardware-RSS is a practical mechanism to do - so. */ + Each QUIC tile serves a single network device RX queue, and + optionally a loopback RX queue. Serving multiple network interfaces + or multiple queues aside from loopback (receive side scaling) + requires multiple QUIC tiles. Multi-queue deployments require the + use of flow steering to ensure that each QUIC connection only reaches + one QUIC tile at a time. Flow steering based on UDP/IP source + hashing as frequently implemented by hardware-RSS is a practical + mechanism to do so. */ #include "../fd_disco_base.h" #include "../../tango/quic/fd_quic.h" #include "../../tango/xdp/fd_xdp.h" -#include "../../ballet/txn/fd_txn.h" - -#if FD_HAS_HOSTED - -/* FD_TPU_MTU is the max serialized byte size of a txn sent over TPU. */ -#define FD_TPU_MTU (1232UL) -/* FD_TPU_DCACHE_MTU is the max size of a dcache entry */ -#define FD_TPU_DCACHE_MTU (FD_TPU_MTU + FD_TXN_MAX_SZ + 2UL) +/* An fd_quic_tile will use the cnc application region to accumulate the + following tile specific counters: -/* An fd_quic_tile will use the cnc application region to accumulate - the following tile specific counters: - - CHUNK_IDX is the chunk idx where quic tile should start publishing payloads on boot (ignored if not valid on boot) - TPU_PUB_CNT is the number of txns ingested by the QUIC server - TPU_PUB_SZ is the number of txn bytes ingested by the QUIC server TPU_CONN_LIVE_CNT is the number of currently open QUIC conns - TPU_CONN_SEQ is the sequence number of the last QUIC conn opened + + TPU_CONN_SEQ is the sequence number of the last QUIC conn + opened As such, the cnc app region must be at least 64B in size. - Except for IN_BACKP, none of the diagnostics are cleared at - tile startup (as such that they can be accumulated over multiple - runs). Clearing is up to monitoring scripts. 
*/ - -#define FD_QUIC_CNC_DIAG_CHUNK_IDX ( 6UL) /* On 1st cache line of app region, updated by producer, frequently */ -#define FD_QUIC_CNC_DIAG_TPU_PUB_CNT ( 7UL) /* ", frequently */ -#define FD_QUIC_CNC_DIAG_TPU_PUB_SZ ( 8UL) /* ", frequently */ -#define FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT ( 9UL) /* ", frequently */ -#define FD_QUIC_CNC_DIAG_TPU_CONN_SEQ (10UL) /* ", frequently */ - -/* fd_quic_tpu_msg_ctx_t is the message context of a txn being received - by the QUIC tile over the TPU protocol. It is used to detect dcache - overruns by identifying which QUIC stream is currently bound to a - dcache chunk. An array of fd_quic_tpu_msg_ctx_t to fit entries - forms the dcache's app region. - - This is necessary for stream defrag, during which multiple QUIC - streams produce into multiple dcache chunks concurrently. In the worst - case, a defrag is started for every available chunk in the dcache. - When the producer wraps around to the first dcache entry, it will - override the existing defrag process. This overrun is then safely - detected through a change in conn/stream IDs when this previous defrag - process continues. */ - -struct __attribute__((aligned(32UL))) fd_quic_tpu_msg_ctx { - ulong conn_id; - ulong stream_id; /* ULONG_MAX marks completed msg */ - uchar * data; /* Points to first byte of dcache entry */ - uint sz; - uint tsorig; -}; -typedef struct fd_quic_tpu_msg_ctx fd_quic_tpu_msg_ctx_t; - -/* fd_quic_dcache_app_footprint returns the required footprint in bytes - for the QUIC tile's out dcache app region of the given depth. */ - -FD_FN_CONST static inline ulong -fd_quic_dcache_app_footprint( ulong depth ) { - return depth * sizeof(fd_quic_tpu_msg_ctx_t); -} - -/* FD_QUIC_TILE_SCRATCH_ALIGN specifies the alignment and needed for a - QUIC tile scratch region. ALIGN is an integer power of 2 of at least - double cache line to mitigate various kinds of false sharing. 
*/ + Except for IN_BACKP, none of the diagnostics are cleared at tile + startup (as such that they can be accumulated over multiple runs). + Clearing is up to monitoring scripts. */ + +#define FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT (6UL) /* On 1st cache line of app region, updated by producer, frequently */ +#define FD_QUIC_CNC_DIAG_TPU_CONN_SEQ (7UL) /* ", frequently */ #define FD_QUIC_TILE_SCRATCH_ALIGN (128UL) FD_PROTOTYPES_BEGIN -FD_FN_CONST static inline ulong -fd_quic_tile_scratch_align( void ) { - return FD_QUIC_TILE_SCRATCH_ALIGN; -} +FD_FN_CONST ulong +fd_quic_dcache_app_footprint( ulong depth ); FD_FN_CONST ulong -fd_quic_tile_scratch_footprint( ulong depth ); +fd_quic_tile_scratch_align( void ); + +FD_FN_CONST ulong +fd_quic_tile_scratch_footprint( ulong depth, + ulong in_cnt, + ulong out_cnt ); int -fd_quic_tile( fd_cnc_t * cnc, /* Local join to the tile's command-and-control */ - fd_quic_t * quic, /* QUIC without active join */ - fd_xsk_aio_t * xsk_aio, /* Local join to QUIC XSK aio */ - fd_xsk_aio_t * lo_xsk_aio, /* Local join to QUIC XSK aio for loopback interface */ - fd_frag_meta_t * mcache, /* Local join to the tile's txn output mcache */ - uchar * dcache, /* Local join to the tile's txn output dcache */ - long lazy, /* Laziness, <=0 means use a reasonable default */ - fd_rng_t * rng, /* Local join to the rng this tile should use */ - void * scratch, /* Tile scratch memory */ - double tick_per_ns ); /* Result of fd_tempo_tick_per_ns( NULL ) */ +fd_quic_tile( fd_cnc_t * cnc, /* Local join to the quic's command-and-control */ + ulong pid, /* Tile PID for diagnostic purposes */ + fd_quic_t * quic, /* Local join to the quic's quic context */ + ushort legacy_transaction_port, /* Port to "listen" on for non-QUIC (raw UDP) transactions */ + ulong xsk_aio_cnt, /* Number of xsk_aio producers to poll, indexed [0,xsk_aio_cnt) */ + fd_xsk_aio_t ** xsk_aio, /* xsk_aio[xsk_aio_idx] is the local join to xsk_aio producer */ + fd_frag_meta_t * mcache, /* Local join to the quic's frag stream output mcache */ + uchar * 
dcache, /* Local join to the quic's frag stream output dcache */ + ulong cr_max, /* Maximum number of flow control credits, 0 means use a reasonable default */ + long lazy, /* Laziness, <=0 means use a reasonable default */ + fd_rng_t * rng, /* Local join to the rng this quic should use */ + void * scratch ); /* Tile scratch memory */ FD_PROTOTYPES_END -#endif /* FD_HAS_HOSTED */ - -#endif /* HEADER_fd_src_disco_tpu_fd_tpu_h */ +#endif /* HEADER_fd_src_disco_quic_fd_quic_h */ diff --git a/src/disco/quic/fd_quic_tile.c b/src/disco/quic/fd_quic_tile.c deleted file mode 100644 index b5a1eb8b58..0000000000 --- a/src/disco/quic/fd_quic_tile.c +++ /dev/null @@ -1,595 +0,0 @@ -#include "fd_quic.h" - -#if !FD_HAS_HOSTED -#error "fd_quic tile requires FD_HAS_HOSTED" -#endif - -#define SCRATCH_ALLOC( a, s ) (__extension__({ \ - ulong _scratch_alloc = fd_ulong_align_up( scratch_top, (a) ); \ - scratch_top = _scratch_alloc + (s); \ - (void *)_scratch_alloc; \ - })) - -/* dcache app region related ******************************************/ - -FD_FN_CONST static inline ulong -fd_quic_chunk_idx( ulong chunk0, - ulong chunk ) { - return ((chunk-chunk0)*FD_CHUNK_FOOTPRINT) / fd_ulong_align_up( FD_TPU_DCACHE_MTU, FD_CHUNK_FOOTPRINT ); -} - -/* fd_quic_dcache_msg_ctx returns a pointer to the TPU/QUIC message - context struct for the given dcache app laddr and chunk. app_laddr - points to the first byte of the dcache's app region in the tile's - local address space and has FD_DCACHE_ALIGN alignment (see - fd_dcache_app_laddr()). chunk must be within the valid bounds for - this dcache. 
*/ - -FD_FN_CONST static inline fd_quic_tpu_msg_ctx_t * -fd_quic_dcache_msg_ctx( uchar * app_laddr, - ulong chunk0, - ulong chunk ) { - fd_quic_tpu_msg_ctx_t * msg_arr = (fd_quic_tpu_msg_ctx_t *)app_laddr; - return &msg_arr[ fd_quic_chunk_idx( chunk0, chunk ) ]; -} - -/* QUIC context related ***********************************************/ - -/* Local publish queue populated by QUIC service callbacks */ -#define QUEUE_NAME pubq -#define QUEUE_T fd_quic_tpu_msg_ctx_t * -#include "../../util/tmpl/fd_queue_dynamic.c" - -/* fd_quic_tpu_ctx_t is the tile context object provided to callbacks - from fd_quic. */ - -struct fd_quic_tpu_ctx { - /* dcache */ - - uchar * base; /* dcache chunk region */ - uchar * dcache_app; /* dcache app region */ - ulong chunk0; - ulong wmark; - - ulong chunk; /* current dcache chunk idx */ - - /* mcache */ - ulong inflight_streams; - fd_frag_meta_t * mcache; - ulong * seq; - ulong depth; - - /* publish stack */ - - fd_quic_tpu_msg_ctx_t ** pubq; - - /* meta */ - - ulong cnc_diag_tpu_conn_live_cnt; - ulong cnc_diag_tpu_conn_seq; -}; -typedef struct fd_quic_tpu_ctx fd_quic_tpu_ctx_t; - -/* QUIC callbacks *****************************************************/ - -/* Tile-local sequence number for conns */ -static FD_TLS ulong conn_seq = 0UL; - -/* fd_tpu_now implements fd_quic_now_t */ -static ulong -fd_tpu_now( void * ctx ) { - (void)ctx; - return (ulong)fd_log_wallclock(); -} - -/* fd_tpu_conn_create implements fd_quic_cb_conn_new_t */ -static void -fd_tpu_conn_create( fd_quic_conn_t * conn, - void * _ctx ) { - - conn->local_conn_id = ++conn_seq; - - fd_quic_tpu_ctx_t * ctx = (fd_quic_tpu_ctx_t *)_ctx; - ctx->cnc_diag_tpu_conn_seq = conn_seq; - ctx->cnc_diag_tpu_conn_live_cnt++; -} - -/* fd_tpu_conn_destroy implements fd_quic_cb_conn_final_t */ -static void -fd_tpu_conn_destroy( fd_quic_conn_t * conn, - void * _ctx ) { - (void)conn; - - fd_quic_tpu_ctx_t * ctx = (fd_quic_tpu_ctx_t *)_ctx; - ctx->cnc_diag_tpu_conn_live_cnt--; -} - -static void 
-fd_tpu_dummy_dcache( fd_quic_tpu_ctx_t * ctx ) { - /* By default the dcache only has headroom for one in-flight fragment, but - QUIC might have many. If we exceed the headroom, we publish a dummy - mcache entry to evict the reader from this fragment we want to use so we - can start using it. - - This is not ideal because if the reader is already done with the fragment - we are writing a useless mcache entry, so we try and do it only when - needed. - - The QUIC receive path might typically execute stream_create, - stream_receive, and stream_notice serially, so it is often the case that - even if we are handling multiple new connections in one receive batch, - the in-flight count remains zero or one. */ - if( FD_LIKELY( ctx->inflight_streams > 0 ) ) { - ulong ctl = fd_frag_meta_ctl( 0, 1 /* som */, 1 /* eom */, 0 /* err */ ); - ulong tsnow = fd_frag_meta_ts_comp( fd_tickcount() ); - fd_mcache_publish( ctx->mcache, ctx->depth, *ctx->seq, 1, 0, 0, ctl, tsnow, tsnow ); - *ctx->seq = fd_seq_inc( *ctx->seq, 1UL ); - } -} - -/* fd_tpu_stream_create implements fd_quic_cb_stream_new_t */ -static void -fd_tpu_stream_create( fd_quic_stream_t * stream, - void * _ctx, - int type ) { - - /* At this point, the QUIC client and server have agreed to open a - stream. In case the client has opened this stream, it is assumed - that the QUIC implementation has verified that the client has the - necessary stream quota to do so. */ - - (void)type; /* TODO reject bidi streams? 
*/ - - /* Load QUIC state */ - - fd_quic_tpu_ctx_t * ctx = (fd_quic_tpu_ctx_t *)_ctx; - - ulong conn_id = stream->conn->local_conn_id; - ulong stream_id = stream->stream_id; - - /* Load dcache info */ - - uchar * const base = ctx->base; - uchar * const dcache_app = ctx->dcache_app; - ulong const chunk0 = ctx->chunk0; - ulong const wmark = ctx->wmark; - ulong chunk = ctx->chunk; - - /* Allocate new dcache entry */ - - chunk = fd_dcache_compact_next( chunk, FD_TPU_DCACHE_MTU, chunk0, wmark ); - - fd_quic_tpu_msg_ctx_t * msg_ctx = fd_quic_dcache_msg_ctx( dcache_app, chunk0, chunk ); - msg_ctx->conn_id = conn_id; - msg_ctx->stream_id = stream_id; - msg_ctx->data = fd_chunk_to_laddr( base, chunk ); - msg_ctx->sz = 0U; - msg_ctx->tsorig = (uint)fd_frag_meta_ts_comp( fd_tickcount() ); - - fd_tpu_dummy_dcache( ctx ); - - ctx->inflight_streams += 1; - - /* Wind up for next callback */ - - ctx->chunk = chunk; /* Update dcache chunk index */ - stream->context = msg_ctx; /* Update stream dcache entry */ -} - -void -fd_quic_transaction_receive( fd_quic_t * _ctx, - uchar const * packet, - uint packet_sz ) { - fd_quic_tpu_ctx_t * ctx = _ctx->cb.quic_ctx; - - /* Load dcache info */ - uchar * const base = ctx->base; - uchar * const dcache_app = ctx->dcache_app; - ulong const chunk0 = ctx->chunk0; - ulong const wmark = ctx->wmark; - ulong chunk = ctx->chunk; - - /* Allocate new dcache entry */ - chunk = fd_dcache_compact_next( chunk, FD_TPU_DCACHE_MTU, chunk0, wmark ); - - fd_quic_tpu_msg_ctx_t * msg_ctx = fd_quic_dcache_msg_ctx( dcache_app, chunk0, chunk ); - msg_ctx->conn_id = ULONG_MAX; - msg_ctx->stream_id = ULONG_MAX; - msg_ctx->data = fd_chunk_to_laddr( base, chunk ); - msg_ctx->sz = packet_sz; - msg_ctx->tsorig = (uint)fd_frag_meta_ts_comp( fd_tickcount() ); - - fd_tpu_dummy_dcache( ctx ); - - ctx->inflight_streams += 1; - - /* Add to local publish queue */ - if( FD_UNLIKELY( pubq_full( ctx->pubq ) ) ) { - FD_LOG_WARNING(( "pubq full, dropping" )); - return; - } - - 
fd_memcpy( msg_ctx->data, packet, packet_sz ); - pubq_push( ctx->pubq, msg_ctx ); - - ctx->chunk = chunk; /* Update dcache chunk index */ -} - -/* fd_tpu_stream_receive implements fd_quic_cb_stream_receive_t */ -static void -fd_tpu_stream_receive( fd_quic_stream_t * stream, - void * stream_ctx, - uchar const * data, - ulong data_sz, - ulong offset, - int fin ) { - - (void)fin; /* TODO instantly publish if offset==0UL && fin */ - - /* Bounds check */ - /* TODO this bounds check is not complete and assumes that the QUIC - implementation rejects obviously invalid offset values, e.g. those - that would overflow the data pointer. */ - - ulong total_sz = offset+data_sz; - if( FD_UNLIKELY( total_sz>FD_TPU_MTU || total_szconn->local_conn_id; - ulong stream_id = stream->stream_id; - - /* Load existing dcache chunk ctx */ - - fd_quic_tpu_msg_ctx_t * msg_ctx = (fd_quic_tpu_msg_ctx_t *)stream_ctx; - if( FD_UNLIKELY( msg_ctx->conn_id != conn_id || msg_ctx->stream_id != stream_id ) ) { - //fd_quic_stream_close( stream, 0x03 ); /* FIXME fd_quic_stream_close not implemented */ - FD_LOG_WARNING(( "dcache overflow while demuxing %lu!=%lu %lu!=%lu", conn_id, msg_ctx->conn_id, stream_id, msg_ctx->stream_id )); - return; /* overrun */ - } - - /* Append data into chunk */ - - fd_memcpy( msg_ctx->data + offset, data, data_sz ); - msg_ctx->sz = (uint)total_sz; -} - -/* fd_tpu_stream_notify implements fd_quic_cb_stream_notify_t */ -static void -fd_tpu_stream_notify( fd_quic_stream_t * stream, - void * stream_ctx, - int type ) { - /* Load QUIC state */ - - fd_quic_tpu_msg_ctx_t * msg_ctx = (fd_quic_tpu_msg_ctx_t *)stream_ctx; - fd_quic_conn_t * conn = stream->conn; - fd_quic_t * quic = conn->quic; - fd_quic_tpu_ctx_t * ctx = quic->cb.quic_ctx; /* TODO ugly */ - - if( FD_UNLIKELY( type!=FD_QUIC_NOTIFY_END ) ) { - ctx->inflight_streams -= 1; - return; /* not a successful stream close */ - } - - ulong conn_id = stream->conn->local_conn_id; - ulong stream_id = stream->stream_id; - if( 
FD_UNLIKELY( msg_ctx->conn_id != conn_id || msg_ctx->stream_id != stream_id ) ) { - ctx->inflight_streams -= 1; - return; /* overrun */ - } - - /* Mark message as completed */ - - msg_ctx->stream_id = ULONG_MAX; - - /* Add to local publish queue */ - - if( FD_UNLIKELY( pubq_full( ctx->pubq ) ) ) { - FD_LOG_WARNING(( "pubq full, dropping" )); - ctx->inflight_streams -= 1; - return; - } - pubq_push( ctx->pubq, msg_ctx ); -} - -/* Tile ***************************************************************/ - -ulong -fd_quic_tile_scratch_footprint( ulong depth ) { - return pubq_footprint( depth ); -} - -int -fd_quic_tile( fd_cnc_t * cnc, - fd_quic_t * quic, - fd_xsk_aio_t * xsk_aio, - fd_xsk_aio_t * lo_xsk_aio, - fd_frag_meta_t * mcache, - uchar * dcache, - long lazy, - fd_rng_t * rng, - void * scratch, - double tick_per_ns ) { - - /* cnc state */ - ulong * cnc_diag; - ulong cnc_diag_tpu_pub_cnt; - ulong cnc_diag_tpu_pub_sz; - - /* out frag stream state */ - ulong depth; /* ==fd_mcache_depth( mcache ), depth of the mcache / positive integer power of 2 */ - ulong * sync; /* ==fd_mcache_seq_laddr( mcache ), local addr where QUIC mcache sync info is published */ - ulong seq; /* seq QUIC frag sequence number to publish */ - - void * base; /* ==fd_wksp_containing( dcache ), chunk reference address in the tile's local address space */ - ulong chunk0; /* ==fd_dcache_compact_chunk0( base, dcache ) */ - ulong chunk1; /* ==fd_dcache_compact_chunk1( base, dcache ) */ - ulong wmark; /* ==fd_dcache_compact_wmark ( base, dcache, _pkt_max ), packets chunks start in [chunk0,wmark] */ - ulong chunk; /* Chunk where next packet will be written, in [chunk0,wmark] */ - - /* quic context */ - fd_quic_tpu_ctx_t quic_ctx = {0}; - - /* local publish queue */ - fd_quic_tpu_msg_ctx_t ** msg_pubq; - - /* housekeeping state */ - ulong async_min; /* minimum number of ticks between processing a housekeeping event, positive integer power of 2 */ - - ulong mtu = FD_TPU_DCACHE_MTU; - - /* txn parser */ - 
fd_txn_parse_counters_t txn_parse_counters = {0}; - - do { - - FD_LOG_INFO(( "Booting quic" )); - - if( FD_UNLIKELY( !scratch ) ) { - FD_LOG_WARNING(( "NULL scratch" )); - return 1; - } - - if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)scratch, fd_quic_tile_scratch_align() ) ) ) { - FD_LOG_WARNING(( "misaligned scratch" )); - return 1; - } - - ulong scratch_top = (ulong)scratch; - - /* cnc state init */ - - if( FD_UNLIKELY( !cnc ) ) { FD_LOG_WARNING(( "NULL cnc" )); return 1; } - if( FD_UNLIKELY( fd_cnc_app_sz( cnc )<64UL ) ) { FD_LOG_WARNING(( "cnc app sz must be at least 64" )); return 1; } - if( FD_UNLIKELY( fd_cnc_signal_query( cnc )!=FD_CNC_SIGNAL_BOOT ) ) { FD_LOG_WARNING(( "already booted" )); return 1; } - - cnc_diag = (ulong *)fd_cnc_app_laddr( cnc ); - - cnc_diag_tpu_pub_cnt = 0UL; - cnc_diag_tpu_pub_sz = 0UL; - - /* out frag stream init */ - - if( FD_UNLIKELY( !mcache ) ) { FD_LOG_WARNING(( "NULL mcache" )); return 1; } - depth = fd_mcache_depth ( mcache ); - sync = fd_mcache_seq_laddr( mcache ); - - seq = fd_mcache_seq_query( sync ); - - if( FD_UNLIKELY( !dcache ) ) { FD_LOG_WARNING(( "NULL dcache" )); return 1; } - - base = fd_wksp_containing( dcache ); - if( FD_UNLIKELY( !base ) ) { FD_LOG_WARNING(( "fd_wksp_containing failed" )); return 1; } - - if( FD_UNLIKELY( !fd_dcache_compact_is_safe( base, dcache, mtu, depth ) ) ) { - FD_LOG_WARNING(( "--dcache not compatible with wksp base and --mcache depth" )); - return 1; - } - - if( FD_UNLIKELY( fd_dcache_app_sz( dcache ) < fd_quic_dcache_app_footprint( depth ) ) ) { - FD_LOG_WARNING(( "--dcache app sz too small (min=%lu have=%lu)", - fd_quic_dcache_app_footprint( depth ), - fd_dcache_app_sz( dcache ) )); - return 1; - } - - chunk0 = fd_dcache_compact_chunk0( base, dcache ); - chunk1 = fd_dcache_compact_chunk1( base, dcache ); - wmark = fd_dcache_compact_wmark ( base, dcache, mtu ); - chunk = FD_VOLATILE_CONST( cnc_diag[ FD_QUIC_CNC_DIAG_CHUNK_IDX ] ); - if( FD_UNLIKELY( !((chunk0<=chunk) & (chunk<=wmark)) 
) ) { - chunk = chunk0; - FD_LOG_INFO(( "out of bounds cnc chunk index; overriding initial chunk to chunk0" )); - } - - FD_LOG_INFO(( "dcache chunk %lu", chunk )); - FD_LOG_INFO(( "dcache chunk0 %lu", chunk0 )); - FD_LOG_INFO(( "dcache wmark %lu", wmark )); - FD_LOG_INFO(( "dcache chunk1 %lu", chunk1 )); - FD_LOG_INFO(( "dcache max chunk_idx %lu", fd_quic_chunk_idx( chunk0, chunk1 ) )); - - /* local pubq init */ - - msg_pubq = pubq_join( pubq_new( SCRATCH_ALLOC( pubq_align(), pubq_footprint( depth ) ), depth ) ); - if( FD_UNLIKELY( !msg_pubq ) ) { FD_LOG_WARNING(( "pubq join failed" )); return 1; } - - /* quic server init */ - - if( FD_UNLIKELY( !quic ) ) { FD_LOG_WARNING(( "NULL quic" ) ); return 1; } - fd_quic_callbacks_t * quic_cb = &quic->cb; - if( FD_UNLIKELY( !quic_cb ) ) { FD_LOG_WARNING(( "NULL quic callbacks") ); return 1; } - - quic_cb->conn_new = fd_tpu_conn_create; - quic_cb->conn_hs_complete = NULL; - quic_cb->conn_final = fd_tpu_conn_destroy; - quic_cb->stream_new = fd_tpu_stream_create; - quic_cb->stream_notify = fd_tpu_stream_notify; - quic_cb->stream_receive = fd_tpu_stream_receive; - - quic_cb->now = fd_tpu_now; - quic_cb->now_ctx = NULL; - - quic_ctx.base = base; - quic_ctx.dcache_app = fd_dcache_app_laddr( dcache ); - quic_ctx.chunk0 = chunk0; - quic_ctx.wmark = wmark; - quic_ctx.chunk = chunk; - quic_ctx.pubq = msg_pubq; - quic_ctx.cnc_diag_tpu_conn_live_cnt = 0UL; - quic_ctx.seq = &seq; - quic_ctx.mcache = mcache; - quic_ctx.depth = depth; - quic_ctx.inflight_streams = 0UL; - - quic_cb->quic_ctx = &quic_ctx; - - if( FD_UNLIKELY( !fd_quic_init( quic ) ) ) { FD_LOG_WARNING(( "fd_quic_init failed" )); return 1; } - - /* housekeeping init */ - - if( lazy<=0L ) lazy = fd_tempo_lazy_default( depth ); - FD_LOG_INFO(( "Configuring housekeeping (lazy %li ns)", lazy )); - - async_min = fd_tempo_async_min( lazy, 1UL /*event_cnt*/, (float)tick_per_ns ); - if( FD_UNLIKELY( !async_min ) ) { FD_LOG_WARNING(( "bad lazy" )); return 1; } - - } while(0); - - 
ulong tx_idx = fd_tile_idx(); - - FD_LOG_INFO(( "running QUIC server" )); - fd_cnc_signal( cnc, FD_CNC_SIGNAL_RUN ); - long then = fd_tickcount(); - long now = then; - for(;;) { - - /* Do housekeeping at a low rate in the background */ - if( FD_UNLIKELY( (now-then)>=0L ) ) { - - /* Send synchronization info */ - fd_mcache_seq_update( sync, seq ); - - fd_cnc_heartbeat( cnc, now ); - FD_COMPILER_MFENCE(); - cnc_diag[ FD_QUIC_CNC_DIAG_CHUNK_IDX ] = chunk; - cnc_diag[ FD_QUIC_CNC_DIAG_TPU_PUB_CNT ] += cnc_diag_tpu_pub_cnt; - cnc_diag[ FD_QUIC_CNC_DIAG_TPU_PUB_SZ ] += cnc_diag_tpu_pub_sz; - cnc_diag[ FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT ] = quic_ctx.cnc_diag_tpu_conn_live_cnt; - cnc_diag[ FD_QUIC_CNC_DIAG_TPU_CONN_SEQ ] = quic_ctx.cnc_diag_tpu_conn_seq; - FD_COMPILER_MFENCE(); - cnc_diag_tpu_pub_cnt = 0UL; - cnc_diag_tpu_pub_sz = 0UL; - - /* Receive command-and-control signals */ - ulong s = fd_cnc_signal_query( cnc ); - if( FD_UNLIKELY( s!=FD_CNC_SIGNAL_RUN ) ) { - if( FD_LIKELY( s==FD_CNC_SIGNAL_HALT ) ) break; - fd_cnc_signal( cnc, FD_CNC_SIGNAL_RUN ); - } - - /* Reload housekeeping timer */ - then = now + (long)fd_tempo_async_reload( rng, async_min ); - } - - /* Poll network backend */ - fd_xsk_aio_service( xsk_aio ); - if( FD_UNLIKELY( lo_xsk_aio ) ) fd_xsk_aio_service( lo_xsk_aio ); - - /* Service QUIC clients */ - fd_quic_service( quic ); - - /* Update locals */ - chunk = quic_ctx.chunk; - - /* Publish completed messages */ - ulong pub_cnt = pubq_cnt( msg_pubq ); - for( ulong i=0; istream_id != ULONG_MAX ) ) - continue; /* overrun */ - - /* Get byte slice backing serialized txn data */ - - uchar * txn = msg->data; - ulong txn_sz = msg->sz; - - FD_TEST( txn_sz<=1232UL ); - - /* At this point dcache only contains raw payload of txn. - Beyond end of txn, but within bounds of msg layout, add a trailer - describing the txn layout. - - [ payload ] (txn_sz bytes) - [ pad-align 2B ] (? bytes) - [ fd_txn_t ] (? 
bytes) - [ payload_sz ] (2B) */ - - /* Ensure sufficient space to store trailer */ - - void * txn_t = (void *)( fd_ulong_align_up( (ulong)msg->data + txn_sz, 2UL ) ); - if( FD_UNLIKELY( (mtu - ((ulong)txn_t - (ulong)msg->data)) < (FD_TXN_MAX_SZ+2UL) ) ) { - FD_LOG_WARNING(( "dcache entry too small" )); - continue; - } - - /* Parse transaction */ - - ulong txn_t_sz = fd_txn_parse( txn, txn_sz, txn_t, &txn_parse_counters ); - if( txn_t_sz==0 ) { - FD_LOG_DEBUG(( "fd_txn_parse(sz=%lu) failed", txn_sz )); - continue; /* invalid txn (terminate conn?) */ - } - - /* Write payload_sz */ - - ushort * payload_sz = (ushort *)( (ulong)txn_t + txn_t_sz ); - *payload_sz = (ushort)txn_sz; - - /* End of message */ - - void * msg_end = (void *)( (ulong)payload_sz + 2UL ); - - /* Create mcache entry */ - - ulong chunk = fd_laddr_to_chunk( base, msg->data ); - ulong sz = (ulong)msg_end - (ulong)msg->data; - ulong sig = 0; /* A non-dummy entry representing a finished transaction */ - ulong ctl = fd_frag_meta_ctl( tx_idx, 1 /* som */, 1 /* eom */, 0 /* err */ ); - ulong tsorig = msg->tsorig; - ulong tspub = fd_frag_meta_ts_comp( fd_tickcount() ); - - fd_mcache_publish( mcache, depth, seq, sig, chunk, sz, ctl, tsorig, tspub ); - - /* Windup for the next iteration and accumulate diagnostics */ - - seq = fd_seq_inc( seq, 1UL ); - cnc_diag_tpu_pub_cnt++; - cnc_diag_tpu_pub_sz += sz; - } - pubq_remove_all( msg_pubq ); - quic_ctx.inflight_streams -= pub_cnt; - - now = fd_tickcount(); - } - - do { - - FD_LOG_INFO(( "Halting quic" )); - fd_quic_leave( quic ); - - /* TODO close all open QUIC conns */ - - FD_LOG_INFO(( "Halted quic" )); - fd_cnc_signal( cnc, FD_CNC_SIGNAL_BOOT ); - - } while(0); - - return 0; -} - -#undef SCRATCH_ALLOC diff --git a/src/disco/quic/test_quic_tile.c b/src/disco/quic/test_quic_tile.c index e6a805ae89..cb1ef1d61f 100644 --- a/src/disco/quic/test_quic_tile.c +++ b/src/disco/quic/test_quic_tile.c @@ -8,11 +8,8 @@ #include "../../util/net/fd_ip4.h" #include 
"../../ballet/base58/fd_base58.h" -FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_CHUNK_IDX == 6UL, unit_test ); -FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_TPU_PUB_CNT == 7UL, unit_test ); -FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_TPU_PUB_SZ == 8UL, unit_test ); -FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT== 9UL, uint_test ); -FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_TPU_CONN_SEQ ==10UL, unit_test ); +FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT==6UL, uint_test ); +FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_TPU_CONN_SEQ ==7UL, unit_test ); FD_STATIC_ASSERT( FD_QUIC_TILE_SCRATCH_ALIGN==128UL, unit_test ); @@ -173,28 +170,30 @@ rx_tile_main( int argc, static int tx_tile_main( int argc, - char ** argv ) { + char ** argv ) { (void)argc; test_cfg_t * cfg = (test_cfg_t *)argv; fd_rng_t _rng[1]; fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, cfg->tx_seed, 0UL ) ); - ulong scratch_footprint = fd_quic_tile_scratch_footprint( fd_mcache_depth( cfg->tx_mcache ) ); + ulong scratch_footprint = fd_quic_tile_scratch_footprint( fd_mcache_depth( cfg->tx_mcache ), 0, 1 ); void * scratch = fd_alloca( FD_QUIC_TILE_SCRATCH_ALIGN, scratch_footprint ); FD_TEST( scratch ); FD_TEST( !fd_quic_tile( cfg->tx_cnc, + 0, cfg->tx_quic, - cfg->xsk_aio, - NULL, + 0, + 1, + &cfg->xsk_aio, cfg->tx_mcache, cfg->tx_dcache, + 0, cfg->tx_lazy, rng, - scratch, - fd_tempo_tick_per_ns( NULL ) ) ); + scratch ) ); fd_rng_delete( fd_rng_leave( rng ) ); return 0; @@ -366,6 +365,7 @@ int main( int argc, duration, tx_lazy, cfg->tx_seed, rx_lazy )); ulong const * tx_cnc_diag = (ulong const *)fd_cnc_app_laddr( cfg->tx_cnc ); + ulong const * rx_fseq_diag = (ulong const *)fd_fseq_app_laddr_const( cfg->rx_fseq ); long now = fd_log_wallclock(); long next = now; @@ -381,8 +381,8 @@ int main( int argc, FD_COMPILER_MFENCE(); /* FIXME: add RX_FSEQ / TX_FSEQ / RX_CNC / OTHER TX_CNC stats to monitoring, more pretty printing, etc */ - ulong pub_cnt = tx_cnc_diag[ FD_QUIC_CNC_DIAG_TPU_PUB_CNT ]; - ulong pub_sz = tx_cnc_diag[ 
FD_QUIC_CNC_DIAG_TPU_PUB_SZ ]; + ulong pub_cnt = rx_fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ]; + ulong pub_sz = rx_fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ]; ulong conn_live_cnt = tx_cnc_diag[ FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT ]; ulong conn_seq = tx_cnc_diag[ FD_QUIC_CNC_DIAG_TPU_CONN_SEQ ]; long tps = (long)pub_cnt - last_pub_cnt; diff --git a/src/disco/verify/Local.mk b/src/disco/verify/Local.mk new file mode 100644 index 0000000000..867599947d --- /dev/null +++ b/src/disco/verify/Local.mk @@ -0,0 +1,3 @@ +$(call add-hdrs,fd_verify.h) +$(call add-objs,fd_verify,fd_disco) +# $(call make-unit-test,test_verify,test_verify,fd_disco fd_tango fd_util) diff --git a/src/disco/verify/fd_verify.c b/src/disco/verify/fd_verify.c new file mode 100644 index 0000000000..d5eb3d52bd --- /dev/null +++ b/src/disco/verify/fd_verify.c @@ -0,0 +1,166 @@ +#include "fd_verify.h" + +#include "../mux/fd_mux.h" + +typedef struct { + fd_sha512_t * sha; + + ulong tcache_depth; + ulong tcache_map_cnt; + ulong * tcache_sync; + ulong * tcache_ring; + ulong * tcache_map; + + fd_verify_in_ctx_t * in; + + void * out_wksp; + ulong out_chunk0; + ulong out_wmark; + ulong out_chunk; +} verify_ctx_t; + +/* during_frag is called between pairs for sequence number checks, as + we are reading incoming frags. 
We don't actually need to copy the + fragment here, see fd_dedup.c for why we do this.*/ + +static inline void +during_frag( void * _ctx, + ulong in_idx, + ulong sig, + ulong chunk, + ulong sz, + int * opt_filter ) { + verify_ctx_t * ctx = (verify_ctx_t *)_ctx; + + /* This is a dummy mcache entry to keep frags from getting overrun, do + not process */ + if( FD_UNLIKELY( sig ) ) { + *opt_filter = 1; + return; + } + + if( FD_UNLIKELY( chunkin[in_idx].chunk0 || chunk>=ctx->in[in_idx].wmark || sz > FD_TPU_DCACHE_MTU ) ) + FD_LOG_WARNING(( "chunk %lu %lu corrupt, not in range [%lu,%lu)", chunk, sz, ctx->in[in_idx].chunk0, ctx->in[in_idx].wmark )); + + uchar * src = (uchar *)fd_chunk_to_laddr( ctx->in[in_idx].wksp, chunk ); + uchar * dst = (uchar *)fd_chunk_to_laddr( ctx->out_wksp, ctx->out_chunk ); + + fd_memcpy( dst, src, sz ); +} + +static inline void +after_frag( void * _ctx, + ulong * opt_sig, + ulong * opt_chunk, + ulong * opt_sz, + int * opt_filter ) { + (void)opt_sig; + + verify_ctx_t * ctx = (verify_ctx_t *)_ctx; + + uchar * udp_payload = (uchar *)fd_chunk_to_laddr( ctx->out_wksp, ctx->out_chunk ); + ushort payload_sz = *(ushort*)(udp_payload + *opt_sz - sizeof(ushort)); + fd_txn_t * txn = (fd_txn_t*) fd_ulong_align_up( (ulong)(udp_payload) + payload_sz, 2UL ); + + ulong const * public_key = (ulong const *)(udp_payload + txn->acct_addr_off); + ulong const * sig = (ulong const *)(udp_payload + txn->signature_off); + uchar const * msg = (uchar const *)(udp_payload + txn->message_off); + ulong msg_sz = (ulong)payload_sz - txn->message_off; + + /* Sig is already effectively a cryptographically secure hash of + public_key/private_key and message and sz. So use this to do a + quick dedup of ha traffic (FIXME: POTENTIAL DOS ATTACK IF + SOMEBODY COULD INTERCEPT TRAFFIC AND SUBMIT PACKETS WITH SAME + PUBLIC KEY, SIG AND GARBAGE MESSAGE AHEAD OF THE TRAFFIC ... 
+ SEEMS UNLKELY AS THEY WOULD EITHER BE BEHIND THE INBOUND OR BE + A MITM THAT COULD JUST DISCARD INBOUND TRAFFIC). */ + + int ha_dup; + FD_TCACHE_INSERT( ha_dup, *ctx->tcache_sync, ctx->tcache_ring, ctx->tcache_depth, ctx->tcache_map, ctx->tcache_map_cnt, *sig ); + if( FD_UNLIKELY( ha_dup ) ) { + *opt_filter = 1; + return; + } + + /* We appear to have a message to verify. So verify it. */ + + *opt_filter = !!fd_ed25519_verify( msg, msg_sz, sig, public_key, ctx->sha ); + if( FD_LIKELY( !*opt_filter ) ) { + *opt_chunk = ctx->out_chunk; + *opt_sig = *sig; + ctx->out_chunk = fd_dcache_compact_next( ctx->out_chunk, *opt_sz, ctx->out_chunk0, ctx->out_wmark ); + } +} + +int +fd_verify_tile( fd_cnc_t * cnc, + ulong pid, + ulong in_cnt, + const fd_frag_meta_t ** in_mcache, + ulong ** in_fseq, + uchar const ** in_dcache, + fd_sha512_t * sha, + fd_tcache_t * tcache, + fd_frag_meta_t * mcache, + uchar * dcache, + ulong out_cnt, + ulong ** out_fseq, + ulong cr_max, + long lazy, + fd_rng_t * rng, + void * scratch ) { + verify_ctx_t ctx[1]; + + fd_mux_callbacks_t callbacks[1] = { 0 }; + callbacks->during_frag = during_frag; + callbacks->after_frag = after_frag; + + ulong scratch_top = (ulong)scratch; + + do { + if( FD_UNLIKELY( !dcache ) ) { FD_LOG_WARNING(( "NULL dcache" )); return 1; } + if( FD_UNLIKELY( !tcache ) ) { FD_LOG_WARNING(( "NULL tcache" )); return 1; } + if( FD_UNLIKELY( !sha ) ) { FD_LOG_WARNING(( "NULL sha" )); return 1; } + + ctx->tcache_depth = fd_tcache_depth ( tcache ); + ctx->tcache_map_cnt = fd_tcache_map_cnt ( tcache ); + ctx->tcache_sync = fd_tcache_oldest_laddr( tcache ); + ctx->tcache_ring = fd_tcache_ring_laddr ( tcache ); + ctx->tcache_map = fd_tcache_map_laddr ( tcache ); + + ctx->sha = sha; + + ctx->in = (fd_verify_in_ctx_t*)SCRATCH_ALLOC( alignof(fd_verify_in_ctx_t), in_cnt*sizeof(fd_verify_in_ctx_t) ); + for( ulong i=0; iin[i].wksp = fd_wksp_containing( in_dcache[i] ); + ctx->in[i].chunk0 = fd_dcache_compact_chunk0( ctx->in[i].wksp, in_dcache[i] 
); + ctx->in[i].wmark = fd_dcache_compact_wmark ( ctx->in[i].wksp, in_dcache[i], FD_TPU_DCACHE_MTU ); + } + + ctx->out_wksp = fd_wksp_containing( dcache ); + ctx->out_chunk0 = fd_dcache_compact_chunk0( ctx->out_wksp, dcache ); + ctx->out_wmark = fd_dcache_compact_wmark ( ctx->out_wksp, dcache, FD_TPU_DCACHE_MTU ); + ctx->out_chunk = ctx->out_chunk0; + } while(0); + + return fd_mux_tile( cnc, + pid, + FD_MUX_FLAG_COPY, /* verify copies frags, and does not run zero copy */ + in_cnt, + in_mcache, + in_fseq, + mcache, + out_cnt, + out_fseq, + cr_max, + lazy, + rng, + (void*)fd_ulong_align_up( scratch_top, FD_MUX_TILE_SCRATCH_ALIGN ), + ctx, + callbacks ); +} diff --git a/src/disco/verify/fd_verify.h b/src/disco/verify/fd_verify.h new file mode 100644 index 0000000000..6aefe574b3 --- /dev/null +++ b/src/disco/verify/fd_verify.h @@ -0,0 +1,48 @@ +#ifndef HEADER_fd_src_disco_verify_fd_verify_h +#define HEADER_fd_src_disco_verify_fd_verify_h + +#include "../fd_disco_base.h" + +/* fd_verify_in_ctx_t is a context object for each in (producer) mcache + connected to the verify tile. */ + +typedef struct { + void * wksp; + ulong chunk0; + ulong wmark; +} fd_verify_in_ctx_t; + +/* The verify tile is a wrapper around the mux tile, that also verifies + incoming transaction signatures match the data being signed. Non-matching + transactions are filtered out of the frag stream. 
*/ + +#define FD_VERIFY_TILE_SCRATCH_ALIGN (128UL) +#define FD_VERIFY_TILE_SCRATCH_FOOTPRINT( in_cnt, out_cnt ) \ + FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \ + alignof(fd_verify_in_ctx_t), (in_cnt)*sizeof(fd_verify_in_ctx_t) ), \ + FD_MUX_TILE_SCRATCH_ALIGN, FD_MUX_TILE_SCRATCH_FOOTPRINT( in_cnt, out_cnt ) ), \ + FD_VERIFY_TILE_SCRATCH_ALIGN ) + +FD_PROTOTYPES_BEGIN + +int +fd_verify_tile( fd_cnc_t * cnc, /* Local join to the verify's command-and-control */ + ulong pid, /* Tile PID for diagnostic purposes */ + ulong in_cnt, /* Number of input mcaches to multiplex, inputs are indexed [0,in_cnt) */ + const fd_frag_meta_t ** in_mcache, /* in_mcache[in_idx] is the local join to input in_idx's mcache */ + ulong ** in_fseq, /* in_fseq [in_idx] is the local join to input in_idx's fseq */ + uchar const ** in_dcache, /* in_dcache[in_idx] is the local join to input in_idx's dcache */ + fd_sha512_t * sha, /* Local join to the verify's sha verifier */ + fd_tcache_t * tcache, /* Local join to the verify's tcache for deduplicating signatures */ + fd_frag_meta_t * mcache, /* Local join to the verify's frag stream output mcache */ + uchar * dcache, /* Local join to the verify's frag stream output dcache */ + ulong out_cnt, /* Number of reliable consumers, reliable consumers are indexed [0,out_cnt) */ + ulong ** out_fseq, /* out_fseq[out_idx] is the local join to reliable consumer out_idx's fseq */ + ulong cr_max, /* Maximum number of flow control credits, 0 means use a reasonable default */ + long lazy, /* Lazyiness, <=0 means use a reasonable default */ + fd_rng_t * rng, /* Local join to the rng this verify should use */ + void * scratch ); /* Tile scratch memory */ + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_disco_verify_fd_verify_h */ diff --git a/src/app/frank/load/fd_frank_verify_synth_load.c b/src/disco/verify/verify_synth_load.c similarity index 93% rename from src/app/frank/load/fd_frank_verify_synth_load.c rename to 
src/disco/verify/verify_synth_load.c index 5783f4c768..3972c3dd5c 100644 --- a/src/app/frank/load/fd_frank_verify_synth_load.c +++ b/src/disco/verify/verify_synth_load.c @@ -1,9 +1,7 @@ -#include "../fd_frank.h" - #include int -fd_frank_verify_task( int argc, +fd_app_verify_task( int argc, char ** argv ) { (void)argc; fd_log_thread_set( argv[0] ); @@ -14,7 +12,7 @@ fd_frank_verify_task( int argc, char const * pod_gaddr = argv[1]; - /* Load up the configuration for this frank instance */ + /* Load up the configuration for this app instance */ FD_LOG_INFO(( "using configuration in pod %s at path %s", pod_gaddr, cfg_path )); uchar const * pod = fd_wksp_pod_attach( pod_gaddr ); @@ -38,12 +36,12 @@ fd_frank_verify_task( int argc, int in_backp = 1; FD_COMPILER_MFENCE(); - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_IN_BACKP ] ) = 1UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_BACKP_CNT ] ) = 0UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_CNT ] ) = 0UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_SZ ] ) = 0UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_CNT ] ) = 0UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_SZ ] ) = 0UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_IN_BACKP ] ) = 1UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_BACKP_CNT ] ) = 0UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_HA_FILT_CNT ] ) = 0UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_HA_FILT_SZ ] ) = 0UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_SV_FILT_CNT ] ) = 0UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_SV_FILT_SZ ] ) = 0UL; FD_COMPILER_MFENCE(); FD_LOG_INFO(( "joining %s.verify.%s.mcache", cfg_path, verify_name )); @@ -229,10 +227,10 @@ fd_frank_verify_task( int argc, /* Send diagnostic info */ fd_cnc_heartbeat( cnc, now ); FD_COMPILER_MFENCE(); - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_CNT ] ) + accum_ha_filt_cnt; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_SZ ] ) = FD_VOLATILE_CONST( 
cnc_diag[ FD_FRANK_CNC_DIAG_HA_FILT_SZ ] ) + accum_ha_filt_sz; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_CNT ] ) + accum_sv_filt_cnt; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_SZ ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_SV_FILT_SZ ] ) + accum_sv_filt_sz; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_HA_FILT_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_APP_CNC_DIAG_HA_FILT_CNT ] ) + accum_ha_filt_cnt; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_HA_FILT_SZ ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_APP_CNC_DIAG_HA_FILT_SZ ] ) + accum_ha_filt_sz; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_SV_FILT_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_APP_CNC_DIAG_SV_FILT_CNT ] ) + accum_sv_filt_cnt; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_SV_FILT_SZ ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_APP_CNC_DIAG_SV_FILT_SZ ] ) + accum_sv_filt_sz; FD_COMPILER_MFENCE(); accum_ha_filt_cnt = 0UL; accum_ha_filt_sz = 0UL; @@ -250,7 +248,7 @@ fd_frank_verify_task( int argc, cr_avail = fd_fctl_tx_cr_update( fctl, cr_avail, seq ); if( FD_UNLIKELY( in_backp ) ) { if( FD_LIKELY( cr_avail ) ) { - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_IN_BACKP ] ) = 0UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_IN_BACKP ] ) = 0UL; in_backp = 0; } } @@ -262,8 +260,8 @@ fd_frank_verify_task( int argc, /* Check if we are backpressured */ if( FD_UNLIKELY( !cr_avail ) ) { if( FD_UNLIKELY( !in_backp ) ) { - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_IN_BACKP ] ) = 1UL; - FD_VOLATILE( cnc_diag[ FD_FRANK_CNC_DIAG_BACKP_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_FRANK_CNC_DIAG_BACKP_CNT ] )+1UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_IN_BACKP ] ) = 1UL; + FD_VOLATILE( cnc_diag[ FD_APP_CNC_DIAG_BACKP_CNT ] ) = FD_VOLATILE_CONST( cnc_diag[ FD_APP_CNC_DIAG_BACKP_CNT ] )+1UL; in_backp = 1; } FD_SPIN_PAUSE(); diff --git a/src/tango/quic/tests/test_quic_client_flood.c b/src/tango/quic/tests/test_quic_client_flood.c index 7dda1456b8..990d79ef0e 100644 --- 
a/src/tango/quic/tests/test_quic_client_flood.c +++ b/src/tango/quic/tests/test_quic_client_flood.c @@ -119,7 +119,7 @@ run_quic_client( FD_TEST( client_conn->state == FD_QUIC_CONN_STATE_ACTIVE ); /* create and sign fake ref message txns */ - /* generate a message for every possible message size, using code from fd_frank_verify_synth_load */ + /* generate a message for every possible message size, using code from verify_synth_load */ fd_rng_t _rng[ 1 ]; uint seed = (uint)fd_tile_id(); fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, seed, 0UL ) ); diff --git a/src/test/frank-single-transaction.sh b/src/test/single-transaction.sh similarity index 100% rename from src/test/frank-single-transaction.sh rename to src/test/single-transaction.sh