-
Notifications
You must be signed in to change notification settings - Fork 366
solcap: v2 #7337
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
solcap: v2 #7337
Conversation
765f7f5 to
a155373
Compare
ibhatt-jumptrading
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nits so far
src/discof/capture/fd_capture_tile.c
Outdated
| } | ||
| default: | ||
| /* Unknown signal received in message processing */ | ||
| FD_LOG_ERR(( "Unknown signal received in message processing: sig=%lu", (unsigned long)msg_hdr->sig )); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ulong
src/discof/capture/fd_capture_tile.c
Outdated
| // FD_LOG_WARNING(("SOM: in_idx=%lu sig=%u->%u eom=%d", in_idx, msg_hdr->sig, ctx->msg_set_sig, eom)); | ||
| /* Handle file rotation for recent_only mode */ | ||
| if( ctx->recent_only ) { | ||
| if( ctx->recent_file_start_slot == ULONG_MAX ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use brackets for this if statement.
i would also do if( !ctx->recent_only ) return; instead of nesthing this big code block
a155373 to
912bd0f
Compare
Performance Measurements ⏳
|
912bd0f to
9069270
Compare
Performance Measurements ⏳
|
9069270 to
a858a49
Compare
Performance Measurements ⏳
|
a858a49 to
5eb62c9
Compare
Performance Measurements ⏳
|
5eb62c9 to
eaa62ba
Compare
eaa62ba to
3574184
Compare
be74c85 to
9d70540
Compare
Performance Measurements ⏳
|
| run: | | ||
| sudo prlimit --pid $$ --memlock=-1:-1 | ||
| source /opt/${{ matrix.compiler }}/${{ matrix.compiler }}-${{ matrix.compiler-version }}/activate | ||
| DUMP=../dump make run-solcap-tests |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These tests ensured that the solcap infra itself works. Let's make sure we keep an integration test
| Nishk (TODO): Write more docs for capture context | ||
| */ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's resolve this todo before merging
| #include "../../flamenco/fd_rwlock.h" | ||
| #include "../../util/fd_util_base.h" | ||
| #include "../../util/log/fd_log.h" | ||
| #include <sys/types.h> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unused import
| struct fd_capture_link { | ||
| const fd_capture_link_vt_t * vt; | ||
| }; | ||
| typedef struct fd_capture_link fd_capture_link_t; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a brief comment to each of these types explaining what it does
| typedef struct fd_capture_link_vt fd_capture_link_vt_t; | ||
|
|
||
| struct fd_capture_link { | ||
| const fd_capture_link_vt_t * vt; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| const fd_capture_link_vt_t * vt; | |
| fd_capture_link_vt_t const * vt; |
| fd_pubkey_t pubkey; | ||
| fd_solana_account_meta_t info; | ||
| ulong data_sz; | ||
| fd_hash_t hash; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Document what type of hash is this. (First 32 bytes of the lthash?)
|
|
||
|
|
||
| /* If we're processing a message from a different input, skip this | ||
| fragment */ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would we not want to repeat this fragment instead of skipping it?
| if( som ) { | ||
| msg_hdr = fd_type_pun( (void *)data ); | ||
| actual_data = (char *)(data + sizeof(fd_solcap_buf_msg_t)); | ||
| actual_data_sz = sz - sizeof(fd_solcap_buf_msg_t); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
missing bounds check, sz can be zero and this can underflow
| */ | ||
| FD_TEST( fsync( ctx->fd ) == 0 ); | ||
| FD_TEST( fsync( next_fd ) == 0 ); | ||
| FD_TEST( ftruncate( next_fd, 0L ) == 0 ); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think fsync is necessary here. ftruncate operates on page cache, like write. fsync doesn't change the file content from a VFS point of view. It just cleans write caches
|
|
||
| if (ctx->msg_set_sig == SOLCAP_WRITE_BANK_PREIMAGE) { | ||
| FD_TEST( fsync(ctx->fd) == 0 ); | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove fsync
| #include "../../disco/stem/fd_stem.c" | ||
|
|
||
| fd_topo_run_tile_t fd_tile_capture = { | ||
| .name = "captur", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd call it "solcap" tile since that's the name of the project
| #include "../../tango/fd_tango_base.h" | ||
| #include "../../tango/fseq/fd_fseq.h" | ||
|
|
||
| #include <time.h> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unused import?
| } | ||
|
|
||
| static uint | ||
| valid_slot_range(fd_capture_ctx_t * ctx, ulong slot) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| valid_slot_range(fd_capture_ctx_t * ctx, ulong slot) { | |
| valid_slot_range( fd_capture_ctx_t * ctx, | |
| ulong slot ) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not a fan of the idea of a capture context. It conflates two separate things - The creation of solfuzz test inputs (protosol Protobufs) and solcaps. Let's just have a src/discof/solcap module and then separate logic for creating Protobufs in src/flamenco/runtime/tests
|
|
||
| /* Used currently by solcap to maintain ordering of messages | ||
| this is a hack for v2.0, will change to using txn sigs eventually */ | ||
| ulong capture_txn_idx; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you consider this a hack? I don't think there's anything wrong with letting the replay sequence transactions for internal identification.
| fd_capture_ctx_t * capture_ctx; | ||
| FILE * capture_file; | ||
| fd_capture_ctx_t * capture_ctx; | ||
| FILE * capture_file; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does the replay tile still hold on to a file handle if the writing logic has been moved to the solcap tile?
| memcpy( &exec_msg->txn, txn_p, sizeof(fd_txn_p_t) ); | ||
| exec_msg->bank_idx = task->txn_exec->bank_idx; | ||
| exec_msg->txn_idx = task->txn_exec->txn_idx; | ||
| if ( FD_UNLIKELY( ctx->capture_ctx ) ) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if ( FD_UNLIKELY( ctx->capture_ctx ) ) { | |
| if( FD_UNLIKELY( ctx->capture_ctx ) ) { |
| ulong consumer_tile_idx = fd_topo_find_tile( topo, "captur", 0UL ); | ||
| fd_topo_tile_t * consumer_tile = &topo->tiles[ consumer_tile_idx ]; | ||
| cap_repl_out->fseq = NULL; | ||
| for (ulong j = 0UL; j < consumer_tile->in_cnt; j++ ) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| for (ulong j = 0UL; j < consumer_tile->in_cnt; j++ ) { | |
| for( ulong j = 0UL; j < consumer_tile->in_cnt; j++ ) { |
| #include <stdlib.h> | ||
| #include <stdbool.h> | ||
|
|
||
| #include <bits/stdint-uintn.h> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
random import, remove
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please configure your editor to not do auto imports, most of them in this PR are wrong
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://github.com/firedancer-io/solcap-tools this repo is still empty, therefore we cannot yet remove production debugging tooling. Please revert deletion. Let's add it back when solcap-tools is promoted to stable
| /* 0x10 */ uint meta_sz; | ||
| /* 0x14 */ uint _pad14[3]; | ||
| .solcap is a portable file format for capturing Solana runtime data | ||
| suitable for replay and debugging. It is laid out below: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
solcap is not a file format. This is just a pcapng file capturing solcap packets.
Important to make this distinction so it's clear to the reader that this format will work with existing tools like Wireshark.
| -- ... | ||
| The solcap format is compatible with the pcapng format, allowing for | ||
| easy interoperability with existing tools that support pcapng. The |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is not only compatible, it is pcapng format
| Enhanced Packet Block (EPB) - A single solcap message. | ||
| The internal chunk header contains additional metadata about the | ||
| message, used for identifying the message and its position in the | ||
| stream. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are redundant to declare. I would just point to the pcapng spec for the definitions of SHB, IDB, and EPB.
And then briefly mention any restrictions such as:
The IDB uses DLT_USER0 (specify the Ethertype that solcap uses).
EPB is always used.
Max packet size
| - Account Updates | ||
| - Bank Preimages | ||
| The dumping of the exectuion can be done in multiple ways: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
indent
| header allows for the reader to process the message in the correct | ||
| encoding scheme. Currently the list of messages is: | ||
| - Account Updates | ||
| - Bank Preimages |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the most important part of the protocol specification. And should be moved to a separate Markdown specification file. This should specify:
- What are solcap packets / what problems do they solve (brief intro part)
- The conceptual solcap layers (the base layer would probably be a muxing layer that identifies different solcap sub-types)
- Every packet format documented as a C struct
- Every enum (such as different solcap packet types)
This work should be done since solcap aims to be a portable / generic format for structured tracing of Solana validator events. Therefore, the source of truth should be in a specification file, not in the Firedancer code
| [SOLCAP_WRITE_BANK_PREIMAGE] = SOLCAP_WRITE_BANK_PREIMAGE, \ | ||
| [SOLCAP_WRITE_STAKE_REWARD_EVENT] = SOLCAP_WRITE_STAKE_REWARD_EVENT, \ | ||
| [SOLCAP_WRITE_VOTE_ACCOUNT_PAYOUT] = SOLCAP_WRITE_VOTE_ACCOUNT_PAYOUT, \ | ||
| })[x]) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I kind of see what this is trying to do, but why not just make this a static ushort const[] = { ... } instead of a define?
| /* 0x20 */ long acc_coff; /* chunk offset to account chunk */ | ||
| /* 0x28 */ ulong _pad28[5]; | ||
| /* 0x50 */ | ||
| typedef struct fd_solcap_chunk_idb_hdr fd_solcap_chunk_idb_hdr_t; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To separate layers cleanly, pcapng structs should not be defined in the solcap protocol layer (since pcap is a layer below). Use src/util/net/pcapng instead
| /* 0x18 */ uint32_t original_packet_len; /* original packet length */ | ||
| /* 0x1c */ /* packet data follows immediately after this structure */ | ||
| }; | ||
| typedef struct fd_solcap_chunk_epb_hdr fd_solcap_chunk_epb_hdr_t; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same here, remove EPB and reuse pcapng header
| struct __attribute__((packed)) fd_solcap_chunk_ftr { | ||
| /* int_hdr + packet_len */ uint32_t block_len_redundant; /* length of the block */ | ||
| }; | ||
| typedef struct fd_solcap_chunk_ftr fd_solcap_chunk_ftr_t; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove this and reuse pcapng
| fd_pubkey_t stake_acc_addr; | ||
| fd_pubkey_t vote_acc_addr; | ||
| uint commission; | ||
| long vote_rewards; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hidden padding
| ulong payout_epoch; | ||
| ulong reward_epoch; | ||
| ulong inflation_lamports; | ||
| uint128 total_points; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hidden padding
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not portable btw, uint128 has different alignments on different arches (8 vs 16 bytes)
| if( FD_UNLIKELY( err!=0 ) ) return err; \ | ||
| } while(0) | ||
|
|
||
| #include <time.h> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why time.h?
| FD_LOG_WARNING(( "fseek failed (%d-%s)", errno, strerror( errno ) )); | ||
| return NULL; | ||
| } | ||
| FD_TEST(sizeof(fd_solcap_chunk_idb_hdr_t) == write(fd, &idb_hdr, sizeof(fd_solcap_chunk_idb_hdr_t))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(This is completely fine but would just like to point out to avoid confusion:)
While the protocol specification (in headers) should be cleanly separated out in layers, it's fine to mix all the layers in the writer class for efficiency. It's just important that the definitions themselves are separate if we ever do live streaming (e.g. over InfiniBand) or graduate from pcapng to a more heavy duty data format (like Parquet or whatever). Keeping the transport/container and data layers separate helps with that.
| if( FD_UNLIKELY( !pb_encode( &stream, desc, msg ) ) ) { | ||
| FD_LOG_WARNING(( "pb_encode failed (%s)", PB_GET_ERROR(&stream) )); | ||
| return EPROTO; | ||
| if (padding_needed > 0) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (padding_needed > 0) { | |
| if( padding_needed > 0 ) { |
formatting bugs
| return EPROTO; | ||
| if (padding_needed > 0) { | ||
| static const char zeros[4] = {0}; | ||
| FD_TEST(padding_needed == write(fd, zeros, padding_needed)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doing a syscall to write 4 padding bytes seems very inefficient, consider the use of fd_io_buffered
| typedef struct fd_solcap_writer fd_solcap_writer_t; | ||
| /* fd_solcap_writer_t is a writer utility for solcap files. | ||
| Each soclap write function is responsible for encoding and writing |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| Each soclap write function is responsible for encoding and writing | |
| Each solcap write function is responsible for encoding and writing |
|
|
||
| struct fd_solcap_writer; | ||
| typedef struct fd_solcap_writer fd_solcap_writer_t; | ||
| /* fd_solcap_writer_t is a writer utility for solcap files. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /* fd_solcap_writer_t is a writer utility for solcap files. | |
| /* fd_solcap_writer_t produces pcapng dumps containing solcap packets. |
| typedef struct fd_solcap_writer { | ||
| int fd; | ||
| ulong stream_goff; | ||
| } fd_solcap_writer_t; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
separate struct definition and type def
| fd_solcap_writer_footprint returns a non-zero byte count. */ | ||
| typedef struct fd_solcap_writer { | ||
| int fd; | ||
| ulong stream_goff; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"goff" (global offset) was a solcap v1 thing, I think we can remove this entirely and use the file descriptor's internal offset?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The stub was quite useful to retain portability for executing the runtime on a non-hosted target, like an embedded device. But I don't insist on keeping it.
| epoch, | ||
| epoch-1UL, /* FIXME: this is not strictly correct */ | ||
| *rewards_out, | ||
| (fd_w_u128_t){ .ud=points } ); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Solcap v2 should only be merged when it's at feature parity with solcap v1, if solcap v1 is being simultaneously removed.
| runner->solcap_file = fopen( solcap_path, "w" ); | ||
| if( FD_UNLIKELY( !runner->solcap_file ) ) { | ||
| FD_LOG_ERR(( "fopen($FD_SOLCAP=%s) failed (%i-%s)", solcap_path, errno, fd_io_strerror( errno ) )); | ||
| int fd = open( solcap_path, O_WRONLY | O_CREAT | O_TRUNC, 0644 ); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use of file descriptor I/O directly is less efficient than using libc FILE or fd_io_buffered due to lack of buffering / more frequent syscalls
| $(call make-unit-test,test_sol_compat,test_sol_compat,fd_flamenco_test fd_flamenco fd_tango fd_funk fd_ballet fd_util fd_disco,$(SECP256K1_LIBS) $(FLATCC_LIBS)) | ||
| $(call make-shared,libfd_exec_sol_compat.so,fd_sol_compat,fd_flamenco_test fd_flamenco fd_funk fd_ballet fd_util fd_disco,$(SECP256K1_LIBS) $(FLATCC_LIBS) $(SOL_COMPAT_FLAGS)) | ||
| $(call make-unit-test,test_sol_compat,test_sol_compat,fd_flamenco_test fd_flamenco fd_tango fd_funk fd_ballet fd_util fd_disco fd_discof,$(SECP256K1_LIBS) $(FLATCC_LIBS)) | ||
| $(call make-shared,libfd_exec_sol_compat.so,fd_sol_compat,fd_flamenco_test fd_flamenco fd_funk fd_ballet fd_util fd_disco fd_discof,$(SECP256K1_LIBS) $(FLATCC_LIBS) $(SOL_COMPAT_FLAGS)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a layering violation / circular dependency. discof (runtime tiles) is layered on top of flamenco (Firedancer SVM). The SVM should not depend on prod tile code
| struct __attribute__((packed)) fd_solana_account_meta { | ||
| ulong lamports; | ||
| ulong rent_epoch; | ||
| uchar owner[32]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove solana_account_meta and solana_account_hdr from types.json entirely and just make them part of solcap
9d70540 to
8065fae
Compare
No description provided.