diff --git a/encoding/codecv0.go b/encoding/codecv0.go index 0e8b70f..617d34c 100644 --- a/encoding/codecv0.go +++ b/encoding/codecv0.go @@ -430,3 +430,7 @@ func (d *DACodecV0) computeBatchDataHash(chunks []*Chunk, totalL1MessagePoppedBe dataHash := crypto.Keccak256Hash(dataBytes) return dataHash, nil } + +func (d *DACodecV0) CompressScrollBatchBytes(batchBytes []byte) ([]byte, error) { + return batchBytes, nil +} diff --git a/encoding/codecv2.go b/encoding/codecv2.go index fe2d338..1c90f86 100644 --- a/encoding/codecv2.go +++ b/encoding/codecv2.go @@ -154,7 +154,7 @@ func (d *DACodecV2) constructBlobPayload(chunks []*Chunk, maxNumChunksPerBatch i copy(challengePreimage[0:], hash[:]) // blobBytes represents the compressed blob payload (batchBytes) - blobBytes, err := zstd.CompressScrollBatchBytes(batchBytes) + blobBytes, err := d.CompressScrollBatchBytes(batchBytes) if err != nil { return nil, common.Hash{}, nil, nil, common.Hash{}, err } @@ -236,7 +236,7 @@ func (d *DACodecV2) EstimateChunkL1CommitBatchSizeAndBlobSize(c *Chunk) (uint64, if err != nil { return 0, 0, fmt.Errorf("failed to construct batch payload in blob: %w", err) } - blobBytes, err := zstd.CompressScrollBatchBytes(batchBytes) + blobBytes, err := d.CompressScrollBatchBytes(batchBytes) if err != nil { return 0, 0, fmt.Errorf("failed to compress scroll batch bytes: %w", err) } @@ -249,7 +249,7 @@ func (d *DACodecV2) EstimateBatchL1CommitBatchSizeAndBlobSize(b *Batch) (uint64, if err != nil { return 0, 0, err } - blobBytes, err := zstd.CompressScrollBatchBytes(batchBytes) + blobBytes, err := d.CompressScrollBatchBytes(batchBytes) if err != nil { return 0, 0, err } @@ -263,7 +263,7 @@ func (d *DACodecV2) checkCompressedDataCompatibility(chunks []*Chunk) (bool, err if err != nil { return false, fmt.Errorf("failed to construct batch payload in blob: %w", err) } - blobBytes, err := zstd.CompressScrollBatchBytes(batchBytes) + blobBytes, err := d.CompressScrollBatchBytes(batchBytes) if err != nil { return false, fmt.Errorf("failed to compress scroll batch bytes: %w", err) } @@ -289,3 +289,8 @@ func (d *DACodecV2) CheckChunkCompressedDataCompatibility(c *Chunk) (bool, error func (d *DACodecV2) CheckBatchCompressedDataCompatibility(b *Batch) (bool, error) { return d.checkCompressedDataCompatibility(b.Chunks) } + +// CompressScrollBatchBytes compresses the batch bytes using zstd compression. 
+func (d *DACodecV2) CompressScrollBatchBytes(batchBytes []byte) ([]byte, error) { + return zstd.CompressScrollBatchBytesLegacy(batchBytes) +} diff --git a/encoding/codecv4.go b/encoding/codecv4.go index 8ab6d20..396018d 100644 --- a/encoding/codecv4.go +++ b/encoding/codecv4.go @@ -14,8 +14,6 @@ import ( "github.com/scroll-tech/go-ethereum/crypto" "github.com/scroll-tech/go-ethereum/crypto/kzg4844" "github.com/scroll-tech/go-ethereum/log" - - "github.com/scroll-tech/da-codec/encoding/zstd" ) type DACodecV4 struct { @@ -205,7 +203,7 @@ func (d *DACodecV4) constructBlobPayload(chunks []*Chunk, maxNumChunksPerBatch i if enableCompression { // blobBytes represents the compressed blob payload (batchBytes) var err error - blobBytes, err = zstd.CompressScrollBatchBytes(batchBytes) + blobBytes, err = d.CompressScrollBatchBytes(batchBytes) if err != nil { return nil, common.Hash{}, nil, nil, common.Hash{}, err } @@ -267,7 +265,7 @@ func (d *DACodecV4) estimateL1CommitBatchSizeAndBlobSize(chunks []*Chunk) (uint6 return 0, 0, fmt.Errorf("failed to compress scroll batch bytes: %w", err) } if enableCompression { - blobBytes, err := zstd.CompressScrollBatchBytes(batchBytes) + blobBytes, err := d.CompressScrollBatchBytes(batchBytes) if err != nil { return 0, 0, err } @@ -295,7 +293,7 @@ func (d *DACodecV4) checkCompressedDataCompatibility(chunks []*Chunk) (bool, err if err != nil { return false, fmt.Errorf("failed to construct batch payload in blob: %w", err) } - blobBytes, err := zstd.CompressScrollBatchBytes(batchBytes) + blobBytes, err := d.CompressScrollBatchBytes(batchBytes) if err != nil { return false, fmt.Errorf("failed to compress scroll batch bytes: %w", err) } diff --git a/encoding/codecv7.go b/encoding/codecv7.go index 29ed315..8cacb0c 100644 --- a/encoding/codecv7.go +++ b/encoding/codecv7.go @@ -234,7 +234,7 @@ func (d *DACodecV7) DecodeTxsFromBlob(blob *kzg4844.Blob, chunks []*DAChunkRawTx // If checkLength is true, this function returns if compression is needed based on the compressed data's length, which is used when doing batch bytes encoding. // If checkLength is false, this function returns the result of the compatibility check, which is used when determining the chunk and batch contents. func (d *DACodecV7) checkCompressedDataCompatibility(payloadBytes []byte, checkLength bool) ([]byte, bool, error) { - compressedPayloadBytes, err := zstd.CompressScrollBatchBytes(payloadBytes) + compressedPayloadBytes, err := d.CompressScrollBatchBytes(payloadBytes) if err != nil { return nil, false, fmt.Errorf("failed to compress blob payload: %w", err) } @@ -388,3 +388,8 @@ func (d *DACodecV7) JSONFromBytes(data []byte) ([]byte, error) { return jsonBytes, nil } + +// CompressScrollBatchBytes compresses the batch bytes using zstd compression. +func (d *DACodecV7) CompressScrollBatchBytes(batchBytes []byte) ([]byte, error) { + return zstd.CompressScrollBatchBytesLegacy(batchBytes) +} diff --git a/encoding/codecv8.go b/encoding/codecv8.go index 2e7c83b..1347da3 100644 --- a/encoding/codecv8.go +++ b/encoding/codecv8.go @@ -1,5 +1,31 @@ package encoding +import ( + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + + "github.com/scroll-tech/go-ethereum/common" + "github.com/scroll-tech/go-ethereum/crypto" + "github.com/scroll-tech/go-ethereum/crypto/kzg4844" + "github.com/scroll-tech/go-ethereum/log" + + "github.com/scroll-tech/da-codec/encoding/zstd" +) + +// DACodecV8 uses CompressScrollBatchBytesStandard for compression instead of CompressScrollBatchBytesLegacy. 
+// +// Note: Due to Go's method receiver behavior, we need to override all methods that call checkCompressedDataCompatibility. +// When a method in DACodecV7 calls d.checkCompressedDataCompatibility(), it will always use DACodecV7's version, +// even if the instance is actually a DACodecV8. Therefore, we must override: +// - checkCompressedDataCompatibility (core method using the new compression) +// - constructBlob (calls checkCompressedDataCompatibility) +// - NewDABatch (calls constructBlob) +// - CheckBatchCompressedDataCompatibility (calls checkCompressedDataCompatibility) +// - estimateL1CommitBatchSizeAndBlobSize (calls checkCompressedDataCompatibility) +// - EstimateChunkL1CommitBatchSizeAndBlobSize (calls estimateL1CommitBatchSizeAndBlobSize) +// - EstimateBatchL1CommitBatchSizeAndBlobSize (calls estimateL1CommitBatchSizeAndBlobSize) type DACodecV8 struct { DACodecV7 } @@ -12,3 +38,176 @@ func NewDACodecV8() *DACodecV8 { }, } } + +// checkCompressedDataCompatibility checks the compressed data compatibility for a batch. +// It constructs a blob payload, compresses the data, and checks the compressed data compatibility. +// flag checkLength indicates whether to check the length of the compressed data against the original data. +// If checkLength is true, this function returns if compression is needed based on the compressed data's length, which is used when doing batch bytes encoding. +// If checkLength is false, this function returns the result of the compatibility check, which is used when determining the chunk and batch contents. +func (d *DACodecV8) checkCompressedDataCompatibility(payloadBytes []byte, checkLength bool) ([]byte, bool, error) { + compressedPayloadBytes, err := d.CompressScrollBatchBytes(payloadBytes) + if err != nil { + return nil, false, fmt.Errorf("failed to compress blob payload: %w", err) + } + + if err = checkCompressedDataCompatibilityV7(compressedPayloadBytes); err != nil { + log.Warn("Compressed data compatibility check failed", "err", err, "payloadBytes", hex.EncodeToString(payloadBytes), "compressedPayloadBytes", hex.EncodeToString(compressedPayloadBytes)) + return nil, false, nil + } + + // check if compressed data is bigger or equal to the original data -> no need to compress + if checkLength && len(compressedPayloadBytes) >= len(payloadBytes) { + log.Warn("Compressed data is bigger or equal to the original data", "payloadBytes", hex.EncodeToString(payloadBytes), "compressedPayloadBytes", hex.EncodeToString(compressedPayloadBytes)) + return nil, false, nil + } + + return compressedPayloadBytes, true, nil +} + +// NewDABatch creates a DABatch including blob from the provided Batch. 
+func (d *DACodecV8) NewDABatch(batch *Batch) (DABatch, error) { + if len(batch.Blocks) == 0 { + return nil, errors.New("batch must contain at least one block") + } + + if err := checkBlocksBatchVSChunksConsistency(batch); err != nil { + return nil, fmt.Errorf("failed to check blocks batch vs chunks consistency: %w", err) + } + + blob, blobVersionedHash, blobBytes, challengeDigest, err := d.constructBlob(batch) + if err != nil { + return nil, fmt.Errorf("failed to construct blob: %w", err) + } + + daBatch, err := newDABatchV7(d.Version(), batch.Index, blobVersionedHash, batch.ParentBatchHash, blob, blobBytes, challengeDigest) + if err != nil { + return nil, fmt.Errorf("failed to construct DABatch: %w", err) + } + + return daBatch, nil +} + +func (d *DACodecV8) constructBlob(batch *Batch) (*kzg4844.Blob, common.Hash, []byte, common.Hash, error) { + blobBytes := make([]byte, blobEnvelopeV7OffsetPayload) + + payloadBytes, err := d.constructBlobPayload(batch) + if err != nil { + return nil, common.Hash{}, nil, common.Hash{}, fmt.Errorf("failed to construct blob payload: %w", err) + } + + compressedPayloadBytes, enableCompression, err := d.checkCompressedDataCompatibility(payloadBytes, true /* checkLength */) + if err != nil { + return nil, common.Hash{}, nil, common.Hash{}, fmt.Errorf("failed to check batch compressed data compatibility: %w", err) + } + + isCompressedFlag := uint8(0x0) + if enableCompression { + isCompressedFlag = 0x1 + payloadBytes = compressedPayloadBytes + } + + sizeSlice := encodeSize3Bytes(uint32(len(payloadBytes))) + + blobBytes[blobEnvelopeV7OffsetVersion] = uint8(d.Version()) + copy(blobBytes[blobEnvelopeV7OffsetByteSize:blobEnvelopeV7OffsetCompressedFlag], sizeSlice) + blobBytes[blobEnvelopeV7OffsetCompressedFlag] = isCompressedFlag + blobBytes = append(blobBytes, payloadBytes...) + + if len(blobBytes) > maxEffectiveBlobBytes { + log.Error("ConstructBlob: Blob payload exceeds maximum size", "size", len(blobBytes), "blobBytes", hex.EncodeToString(blobBytes)) + return nil, common.Hash{}, nil, common.Hash{}, fmt.Errorf("blob exceeds maximum size: got %d, allowed %d", len(blobBytes), maxEffectiveBlobBytes) + } + + // convert raw data to BLSFieldElements + blob, err := makeBlobCanonical(blobBytes) + if err != nil { + return nil, common.Hash{}, nil, common.Hash{}, fmt.Errorf("failed to convert blobBytes to canonical form: %w", err) + } + + // compute blob versioned hash + c, err := kzg4844.BlobToCommitment(blob) + if err != nil { + return nil, common.Hash{}, nil, common.Hash{}, fmt.Errorf("failed to create blob commitment: %w", err) + } + blobVersionedHash := kzg4844.CalcBlobHashV1(sha256.New(), &c) + + // compute challenge digest for codecv7, different from previous versions, + // the blob bytes are padded to the max effective blob size, which is 131072 / 32 * 31 due to the blob encoding + paddedBlobBytes := make([]byte, maxEffectiveBlobBytes) + copy(paddedBlobBytes, blobBytes) + + challengeDigest := crypto.Keccak256Hash(crypto.Keccak256(paddedBlobBytes), blobVersionedHash[:]) + + return blob, blobVersionedHash, blobBytes, challengeDigest, nil +} + +// CheckBatchCompressedDataCompatibility checks the compressed data compatibility for a batch. 
+func (d *DACodecV8) CheckBatchCompressedDataCompatibility(b *Batch) (bool, error) { + if len(b.Blocks) == 0 { + return false, errors.New("batch must contain at least one block") + } + + if err := checkBlocksBatchVSChunksConsistency(b); err != nil { + return false, fmt.Errorf("failed to check blocks batch vs chunks consistency: %w", err) + } + + payloadBytes, err := d.constructBlobPayload(b) + if err != nil { + return false, fmt.Errorf("failed to construct blob payload: %w", err) + } + + // This check is only used for sanity checks. If the check fails, it means that the compression did not work as expected. + // rollup-relayer will try popping the last chunk of the batch (or last block of the chunk when in proposing chunks) and try again to see if it works as expected. + // Since length check is used for DA and proving efficiency, it does not need to be checked here. + _, compatible, err := d.checkCompressedDataCompatibility(payloadBytes, false /* checkLength */) + if err != nil { + return false, fmt.Errorf("failed to check batch compressed data compatibility: %w", err) + } + + return compatible, nil +} + +func (d *DACodecV8) estimateL1CommitBatchSizeAndBlobSize(batch *Batch) (uint64, uint64, error) { + if len(batch.Blocks) == 0 { + return 0, 0, errors.New("batch must contain at least one block") + } + + blobBytes := make([]byte, blobEnvelopeV7OffsetPayload) + + payloadBytes, err := d.constructBlobPayload(batch) + if err != nil { + return 0, 0, fmt.Errorf("failed to construct blob payload: %w", err) + } + + compressedPayloadBytes, enableCompression, err := d.checkCompressedDataCompatibility(payloadBytes, true /* checkLength */) + if err != nil { + return 0, 0, fmt.Errorf("failed to check batch compressed data compatibility: %w", err) + } + + if enableCompression { + blobBytes = append(blobBytes, compressedPayloadBytes...) + } else { + blobBytes = append(blobBytes, payloadBytes...) + } + + return blobEnvelopeV7OffsetPayload + uint64(len(payloadBytes)), calculatePaddedBlobSize(uint64(len(blobBytes))), nil +} + +// EstimateChunkL1CommitBatchSizeAndBlobSize estimates the L1 commit batch size and blob size for a single chunk. +func (d *DACodecV8) EstimateChunkL1CommitBatchSizeAndBlobSize(chunk *Chunk) (uint64, uint64, error) { + return d.estimateL1CommitBatchSizeAndBlobSize(&Batch{ + Blocks: chunk.Blocks, + PrevL1MessageQueueHash: chunk.PrevL1MessageQueueHash, + PostL1MessageQueueHash: chunk.PostL1MessageQueueHash, + }) +} + +// EstimateBatchL1CommitBatchSizeAndBlobSize estimates the L1 commit batch size and blob size for a batch. +func (d *DACodecV8) EstimateBatchL1CommitBatchSizeAndBlobSize(batch *Batch) (uint64, uint64, error) { + return d.estimateL1CommitBatchSizeAndBlobSize(batch) +} + +// CompressScrollBatchBytes compresses the batch bytes using zstd compression. 
+func (d *DACodecV8) CompressScrollBatchBytes(batchBytes []byte) ([]byte, error) { + return zstd.CompressScrollBatchBytesStandard(batchBytes) +} diff --git a/encoding/codecv8_test.go b/encoding/codecv8_test.go index 79e64b7..bd075ba 100644 --- a/encoding/codecv8_test.go +++ b/encoding/codecv8_test.go @@ -2,10 +2,13 @@ package encoding import ( "encoding/hex" + "math/big" "strings" "testing" + "github.com/agiledragon/gomonkey/v2" "github.com/scroll-tech/go-ethereum/common" + "github.com/scroll-tech/go-ethereum/common/hexutil" "github.com/scroll-tech/go-ethereum/core/types" "github.com/stretchr/testify/require" ) @@ -267,3 +270,119 @@ func TestCodecV8BlobEncodingAndHashing(t *testing.T) { }) } } +func TestCodecV8BatchStandardTestCasesEnableCompression(t *testing.T) { + codecV8, err := CodecFromVersion(CodecV8) + require.NoError(t, err) + + // Apply patches to functions to replace behavior for testing. + { + patches := gomonkey.NewPatches() + defer patches.Reset() + + patches.ApplyFunc(convertTxDataToRLPEncoding, func(txData *types.TransactionData) ([]byte, error) { + data, err := hexutil.Decode(txData.Data) + if err != nil { + return nil, err + } + return data, nil + }) + + patches.ApplyFunc(checkCompressedDataCompatibility, func(_ []byte) error { + return nil + }) + } + + maxAvailableBytesIncompressable := maxEffectiveBlobBytes - 5 - blobPayloadV7MinEncodedLength + // 52 bytes for each block as per daBlockV7 encoding. + bytesPerBlock := 52 + + testCases := []struct { + name string + numBlocks int + txData []string + creationErr string + + expectedBlobVersionedHash string + }{ + { + name: "no blocks", + txData: []string{}, + creationErr: "no blocks", + }, + { + name: "single block, single tx", + numBlocks: 1, + txData: []string{"0x010203"}, + expectedBlobVersionedHash: "0x0184fd3d7edf3ea50c76c1751fcc0c4b605ef1f8e7c434ec1c1a1e0e57226cce", + }, + { + name: "single block, multiple tx", + numBlocks: 1, + txData: []string{"0x010203", "0x040506", "0x070809"}, + expectedBlobVersionedHash: "0x01aa8fde33c446276224f47187c7b30f53df61b3a56f5c8876f9c00828642d13", + }, + { + name: "multiple blocks, single tx per block", + numBlocks: 3, + txData: []string{"0x010203"}, + expectedBlobVersionedHash: "0x01b7c6888a192ee1d221eb5fe1e6d15927903541eb178964d851b742c518ccb0", + }, + { + name: "multiple blocks, multiple tx per block", + numBlocks: 3, + txData: []string{"0x010203", "0x040506", "0x070809"}, + expectedBlobVersionedHash: "0x017cd27686cbe8f92b596f0e21f355be1371979624b1a72e6c7471cd5fa782e4", + }, + { + name: "thousands of blocks, multiple tx per block", + numBlocks: 10000, + txData: []string{"0x010203", "0x040506", "0x070809"}, + expectedBlobVersionedHash: "0x016889370d5706071080111716b0f65b6ff9df7d7b1621103b10b0e3eabc9282", + }, + { + name: "single block, single tx, full blob random data -> data bigger compressed than uncompressed", + numBlocks: 1, + txData: []string{generateRandomData(maxAvailableBytesIncompressable - bytesPerBlock)}, + expectedBlobVersionedHash: "0x0128c6dcaa56600132bc09c26af86c31370d0a3dfb8bece776d28291ca3a721e", + }, + { + name: "2 blocks, single tx, full blob random data", + numBlocks: 2, + txData: []string{generateRandomData(maxAvailableBytesIncompressable/2 - bytesPerBlock*2)}, + expectedBlobVersionedHash: "0x018e8240255f00a81140ae445d1784c16f77791b4fe55c07fffa31c9bc15b3a4", + }, + { + name: "single block, single tx, full blob random data -> error because 1 byte too big", + numBlocks: 1, + txData: []string{generateRandomData(maxAvailableBytesIncompressable - bytesPerBlock + 1)}, + 
creationErr: "blob exceeds maximum size", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var blocks []*Block + for i := 0; i < tc.numBlocks; i++ { + block := &Block{ + Header: &types.Header{ + Number: big.NewInt(int64(i)), + }, + Transactions: []*types.TransactionData{}, + } + for _, data := range tc.txData { + tx := &types.TransactionData{Type: 0xff, Data: data} + block.Transactions = append(block.Transactions, tx) + } + blocks = append(blocks, block) + } + + _, blobVersionedHash, _, _, err := codecV8.(*DACodecV8).constructBlob(&Batch{Blocks: blocks}) + if tc.creationErr != "" { + require.ErrorContains(t, err, tc.creationErr) + return + } + require.NoError(t, err) + require.Equal(t, common.HexToHash(tc.expectedBlobVersionedHash), blobVersionedHash) + }) + } +} diff --git a/encoding/da_test.go b/encoding/da_test.go index a0fa6b2..e2eb004 100644 --- a/encoding/da_test.go +++ b/encoding/da_test.go @@ -129,7 +129,7 @@ func TestBlobCompressDecompress(t *testing.T) { blobBytes, err := hex.DecodeString(blobString) assert.NoError(t, err) - compressed, err := zstd.CompressScrollBatchBytes(blobBytes) + compressed, err := zstd.CompressScrollBatchBytesLegacy(blobBytes) assert.NoError(t, err) blob, err := makeBlobCanonical(compressed) diff --git a/encoding/interfaces.go b/encoding/interfaces.go index 2fc3336..e4ca3c8 100644 --- a/encoding/interfaces.go +++ b/encoding/interfaces.go @@ -76,6 +76,9 @@ type Codec interface { EstimateBatchL1CommitCalldataSize(*Batch) (uint64, error) JSONFromBytes([]byte) ([]byte, error) // convert batch header bytes to JSON, this is only used to provide witness data for the prover. + + // CompressScrollBatchBytes compresses batch bytes using the appropriate compression method for this codec version + CompressScrollBatchBytes(batchBytes []byte) ([]byte, error) } // CodecVersion represents the version of the codec. 
@@ -140,3 +143,9 @@ func CodecFromConfig(chainCfg *params.ChainConfig, startBlockNumber *big.Int, st return &DACodecV0{} } } + +// CompressScrollBatchBytes compresses batch bytes using the appropriate codec based on block number and timestamp +func CompressScrollBatchBytes(batchBytes []byte, blockNumber uint64, blockTimestamp uint64, chainCfg *params.ChainConfig) ([]byte, error) { + codec := CodecFromConfig(chainCfg, big.NewInt(int64(blockNumber)), blockTimestamp) + return codec.CompressScrollBatchBytes(batchBytes) +} diff --git a/encoding/interfaces_test.go b/encoding/interfaces_test.go index c965781..77c4aa7 100644 --- a/encoding/interfaces_test.go +++ b/encoding/interfaces_test.go @@ -24,6 +24,7 @@ func TestCodecFromVersion(t *testing.T) { {"CodecV5", CodecV5, &DACodecV5{}, false}, {"CodecV6", CodecV6, &DACodecV6{}, false}, {"CodecV7", CodecV7, &DACodecV7{}, false}, + {"CodecV8", CodecV8, &DACodecV8{}, false}, {"InvalidCodec", CodecVersion(99), nil, true}, } diff --git a/encoding/zstd/add_scroll_prefix_in_zstd_related_symbols.sh b/encoding/zstd/add_scroll_prefix_in_zstd_related_symbols.sh deleted file mode 100755 index 10a0498..0000000 --- a/encoding/zstd/add_scroll_prefix_in_zstd_related_symbols.sh +++ /dev/null @@ -1,29 +0,0 @@ -# Generate redefine.syms for linux_amd64 -/opt/homebrew/opt/llvm/bin/llvm-nm libscroll_zstd_linux_amd64.a | awk '/ZSTD|HUF|FSE|ZBUFF/ {if ($3 != "") print $3 " scroll_" $3}' | sort | uniq > redefine_linux_amd64.syms - -# Use llvm-objcopy to modify symbols for linux_amd64 -llvm-objcopy --redefine-syms=redefine_linux_amd64.syms libscroll_zstd_linux_amd64.a libscroll_zstd_linux_amd64_new.a - -# Move the new file to replace the original and clean up -mv libscroll_zstd_linux_amd64_new.a libscroll_zstd_linux_amd64.a -rm redefine_linux_amd64.syms - -# Generate redefine.syms for linux_arm64 -/opt/homebrew/opt/llvm/bin/llvm-nm libscroll_zstd_linux_arm64.a | awk '/ZSTD|HUF|FSE|ZBUFF/ {if ($3 != "") print $3 " scroll_" $3}' | sort | uniq > redefine_linux_arm64.syms - -# Use llvm-objcopy to modify symbols for linux_arm64 -llvm-objcopy --redefine-syms=redefine_linux_arm64.syms libscroll_zstd_linux_arm64.a libscroll_zstd_linux_arm64_new.a - -# Move the new file to replace the original and clean up -mv libscroll_zstd_linux_arm64_new.a libscroll_zstd_linux_arm64.a -rm redefine_linux_arm64.syms - -# Generate redefine.syms for darwin_arm64 -/opt/homebrew/opt/llvm/bin/llvm-nm libscroll_zstd_darwin_arm64.a | awk '/ZSTD|HUF|FSE|ZBUFF/ {if ($3 != "") print $3 " scroll_" $3}' | sort | uniq > redefine_darwin_arm64.syms - -# Use llvm-objcopy to modify symbols for darwin_arm64 -llvm-objcopy --redefine-syms=redefine_darwin_arm64.syms libscroll_zstd_darwin_arm64.a libscroll_zstd_darwin_arm64_new.a - -# Move the new file to replace the original and clean up -mv libscroll_zstd_darwin_arm64_new.a libscroll_zstd_darwin_arm64.a -rm redefine_darwin_arm64.syms diff --git a/encoding/zstd/add_symbol_prefix.sh b/encoding/zstd/add_symbol_prefix.sh new file mode 100755 index 0000000..e25c297 --- /dev/null +++ b/encoding/zstd/add_symbol_prefix.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +set -e + +# Fixed macOS paths +LLVM_NM="/opt/homebrew/opt/llvm/bin/llvm-nm" +LLVM_OBJCOPY="/opt/homebrew/opt/llvm/bin/llvm-objcopy" + +# List of library files to process +LIBRARIES=( + "libencoder_legacy_darwin_arm64.a:scroll_legacy_" + "libencoder_legacy_linux_amd64.a:scroll_legacy_" + "libencoder_legacy_linux_arm64.a:scroll_legacy_" + "libencoder_standard_darwin_arm64.a:scroll_standard_" + 
"libencoder_standard_linux_amd64.a:scroll_standard_" + "libencoder_standard_linux_arm64.a:scroll_standard_" +) + +echo "=== Adding Symbol Prefixes ===" +echo + +for lib_info in "${LIBRARIES[@]}"; do + IFS=':' read -r LIB_FILE PREFIX <<< "$lib_info" + REDEFINE_FILE="redefine_${LIB_FILE%.*}.syms" + + echo "Processing $LIB_FILE with prefix '$PREFIX'" + + # Check if library file exists + if [ ! -f "$LIB_FILE" ]; then + echo "Warning: Library file not found: $LIB_FILE, skipping..." + continue + fi + + # Check if library is already processed by looking for our prefix + if "$LLVM_NM" "$LIB_FILE" 2>/dev/null | grep -q "${PREFIX}"; then + echo "Library $LIB_FILE already processed (found ${PREFIX} symbols), skipping..." + continue + fi + + # Generate redefine.syms for all potential conflicting symbols + "$LLVM_NM" "$LIB_FILE" | awk ' + /ZSTD|HUF|FSE|ZBUFF|HIST|ERROR|MEM_|XXH|COVER|DICT|POOL|PARAM/ { + if ($3 != "" && $3 !~ /^\./ && $3 !~ /^'"$PREFIX"'/) { + print $3 " '"$PREFIX"'" $3 + } + } + /^[0-9a-fA-F]+ [TDBS] / { + if ($3 != "" && $3 !~ /^\./ && $3 !~ /^__/ && $3 !~ /^'"$PREFIX"'/) { + if ($3 ~ /^(entropy|fse|huf|zstd|hist|error|mem_|pool|param|cover|dict)/) { + print $3 " '"$PREFIX"'" $3 + } + } + } + ' | sort | uniq > "$REDEFINE_FILE" + + # Check if there are symbols to redefine + if [ ! -s "$REDEFINE_FILE" ]; then + echo "No symbols found to redefine in $LIB_FILE" + rm -f "$REDEFINE_FILE" + continue + fi + + echo "Found $(wc -l < "$REDEFINE_FILE") symbols to redefine in $LIB_FILE" + + # Show sample symbols being renamed + echo "Sample symbols to be renamed:" + head -3 "$REDEFINE_FILE" | while read old new; do + echo " $old -> $new" + done + + # Use llvm-objcopy to modify symbols + "$LLVM_OBJCOPY" --redefine-syms="$REDEFINE_FILE" "$LIB_FILE" "${LIB_FILE%.*}_new.a" + + # Move the new file to replace the original and clean up + mv "${LIB_FILE%.*}_new.a" "$LIB_FILE" + rm "$REDEFINE_FILE" + + echo "Successfully processed $LIB_FILE" + echo +done + +echo "=== Symbol Processing Complete ===" +echo +echo "=== Checking for Symbol Conflicts ===" +echo + +# Extract library files for conflict checking +LIB_FILES=() +for lib_info in "${LIBRARIES[@]}"; do + IFS=':' read -r LIB_FILE PREFIX <<< "$lib_info" + LIB_FILES+=("$LIB_FILE") +done + +# Temporary file to store all symbols +temp_file=$(mktemp) + +# Collect all exported symbols from all libraries +echo "Collecting symbols from all libraries..." +for LIB_FILE in "${LIB_FILES[@]}"; do + if [ ! -f "$LIB_FILE" ]; then + echo "Warning: $LIB_FILE not found, skipping..." + continue + fi + + "$LLVM_NM" "$LIB_FILE" 2>/dev/null | awk -v lib="$LIB_FILE" ' + /^[0-9a-fA-F]+ [TDBS] / { + if ($3 != "" && $3 !~ /^\./ && $3 !~ /^__/) { + print $3 "\t" lib + } + } + ' >> "$temp_file" +done + +echo +echo "1. Checking for duplicate symbols across libraries:" + +# Find conflicting symbols +conflicts_output=$(awk '{symbols[$1] = symbols[$1] "\n" $2} END { + conflicts = 0 + for (sym in symbols) { + count = gsub(/\n/, "&", symbols[sym]) + if (count > 1) { + conflicts++ + if (conflicts <= 10) { # Show first 10 conflicts + print " ❌ CONFLICT: " sym + libs = symbols[sym] + gsub(/\n/, ", ", libs) + print " Found in: " libs + print "" + } + } + } + if (conflicts == 0) { + print " ✅ No symbol conflicts found!" + return 0 + } else { + print " ❌ Found " conflicts " conflicting symbols" (conflicts > 10 ? 
" (showing first 10)" : "") + return conflicts + } +}' "$temp_file") + +echo "$conflicts_output" +conflict_count=$(echo "$conflicts_output" | tail -1 | grep -o '[0-9]\+' | tail -1 || echo 0) + +echo +echo "2. Prefix application verification:" + +for lib_info in "${LIBRARIES[@]}"; do + IFS=':' read -r LIB_FILE PREFIX <<< "$lib_info" + + if [ ! -f "$LIB_FILE" ]; then + continue + fi + + # Count unprefixed target symbols + unprefixed_targets=$("$LLVM_NM" "$LIB_FILE" 2>/dev/null | awk -v prefix="$PREFIX" ' + /^[0-9a-fA-F]+ [TDBS] / { + if ($3 != "" && $3 !~ /^\./ && $3 !~ /^__/ && $3 !~ ("^" prefix)) { + # Check if it matches our target patterns + if ($0 ~ /ZSTD|HUF|FSE|ZBUFF|HIST|ERROR|MEM_|XXH|COVER|DICT|POOL|PARAM/ || + $3 ~ /^(entropy|fse|huf|zstd|hist|error|mem_|pool|param|cover|dict)/) { + print $3 + } + } + }' | wc -l) + + # Count prefixed symbols + prefixed_count=$("$LLVM_NM" "$LIB_FILE" 2>/dev/null | grep -c "${PREFIX}" || echo 0) + + echo " $LIB_FILE:" + echo " - Prefixed symbols (${PREFIX}*): $prefixed_count" + echo " - Unprefixed target symbols: $unprefixed_targets" + + if [ "$unprefixed_targets" -gt 0 ]; then + echo " ⚠️ Still has $unprefixed_targets unprefixed target symbols" + echo " Examples:" + "$LLVM_NM" "$LIB_FILE" 2>/dev/null | awk -v prefix="$PREFIX" ' + /^[0-9a-fA-F]+ [TDBS] / { + if ($3 != "" && $3 !~ /^\./ && $3 !~ /^__/ && $3 !~ ("^" prefix)) { + if ($0 ~ /ZSTD|HUF|FSE|ZBUFF|HIST|ERROR|MEM_|XXH|COVER|DICT|POOL|PARAM/ || + $3 ~ /^(entropy|fse|huf|zstd|hist|error|mem_|pool|param|cover|dict)/) { + print " " $3 + } + } + }' | head -3 + else + echo " ✅ All target symbols properly prefixed" + fi + echo +done + +echo "3. Sample prefixed symbols from each library:" +for lib_info in "${LIBRARIES[@]}"; do + IFS=':' read -r LIB_FILE PREFIX <<< "$lib_info" + + if [ ! -f "$LIB_FILE" ]; then + continue + fi + + echo " $LIB_FILE (sample ${PREFIX}* symbols):" + "$LLVM_NM" "$LIB_FILE" 2>/dev/null | awk -v prefix="$PREFIX" ' + /^[0-9a-fA-F]+ [TDBS] / { + if ($3 ~ ("^" prefix)) { + print " " $3 + } + }' | head -3 + echo +done + +echo "4. Preserved original functions:" +for LIB_FILE in "${LIB_FILES[@]}"; do + if [ ! -f "$LIB_FILE" ]; then + continue + fi + + echo " $LIB_FILE:" + "$LLVM_NM" "$LIB_FILE" 2>/dev/null | grep -E "(compress_scroll_batch_bytes_)" | awk '{print " " $2 " " $3}' || echo " No original functions found" + echo +done + +# Cleanup +rm "$temp_file" + +echo "=== Final Analysis ===" + +if [ "$conflict_count" -eq 0 ]; then + echo "🎉 SUCCESS: All libraries processed successfully with no symbol conflicts!" + echo "✅ All target symbols have been properly prefixed" + echo "✅ Original functions preserved" +else + echo "⚠️ WARNING: Found $conflict_count symbol conflicts that need attention." 
+ echo "📋 Please review the conflicts listed above" +fi + +echo +echo "=== Process Complete ===" diff --git a/encoding/zstd/libscroll_zstd_darwin_arm64.a b/encoding/zstd/libencoder_legacy_darwin_arm64.a similarity index 96% rename from encoding/zstd/libscroll_zstd_darwin_arm64.a rename to encoding/zstd/libencoder_legacy_darwin_arm64.a index 9642681..60415f6 100644 Binary files a/encoding/zstd/libscroll_zstd_darwin_arm64.a and b/encoding/zstd/libencoder_legacy_darwin_arm64.a differ diff --git a/encoding/zstd/libscroll_zstd_linux_amd64.a b/encoding/zstd/libencoder_legacy_linux_amd64.a similarity index 90% rename from encoding/zstd/libscroll_zstd_linux_amd64.a rename to encoding/zstd/libencoder_legacy_linux_amd64.a index c4385a5..aba177c 100644 Binary files a/encoding/zstd/libscroll_zstd_linux_amd64.a and b/encoding/zstd/libencoder_legacy_linux_amd64.a differ diff --git a/encoding/zstd/libscroll_zstd_linux_arm64.a b/encoding/zstd/libencoder_legacy_linux_arm64.a similarity index 96% rename from encoding/zstd/libscroll_zstd_linux_arm64.a rename to encoding/zstd/libencoder_legacy_linux_arm64.a index 02183c1..69365ea 100644 Binary files a/encoding/zstd/libscroll_zstd_linux_arm64.a and b/encoding/zstd/libencoder_legacy_linux_arm64.a differ diff --git a/encoding/zstd/libencoder_standard_darwin_arm64.a b/encoding/zstd/libencoder_standard_darwin_arm64.a new file mode 100644 index 0000000..0d03645 Binary files /dev/null and b/encoding/zstd/libencoder_standard_darwin_arm64.a differ diff --git a/encoding/zstd/libencoder_standard_linux_amd64.a b/encoding/zstd/libencoder_standard_linux_amd64.a new file mode 100644 index 0000000..81a84d7 Binary files /dev/null and b/encoding/zstd/libencoder_standard_linux_amd64.a differ diff --git a/encoding/zstd/libencoder_standard_linux_arm64.a b/encoding/zstd/libencoder_standard_linux_arm64.a new file mode 100644 index 0000000..4b222a5 Binary files /dev/null and b/encoding/zstd/libencoder_standard_linux_arm64.a differ diff --git a/encoding/zstd/libscroll_zstd_darwin_arm64.go b/encoding/zstd/libscroll_zstd_darwin_arm64.go index d83ec17..65b8660 100644 --- a/encoding/zstd/libscroll_zstd_darwin_arm64.go +++ b/encoding/zstd/libscroll_zstd_darwin_arm64.go @@ -1,6 +1,9 @@ +//go:build darwin && arm64 && !musl +// +build darwin,arm64,!musl + package zstd /* -#cgo LDFLAGS: ${SRCDIR}/libscroll_zstd_darwin_arm64.a +#cgo LDFLAGS: ${SRCDIR}/libencoder_legacy_darwin_arm64.a ${SRCDIR}/libencoder_standard_darwin_arm64.a */ import "C" diff --git a/encoding/zstd/libscroll_zstd_linux_amd64.go b/encoding/zstd/libscroll_zstd_linux_amd64.go index f1a686e..1b030c1 100644 --- a/encoding/zstd/libscroll_zstd_linux_amd64.go +++ b/encoding/zstd/libscroll_zstd_linux_amd64.go @@ -1,9 +1,9 @@ -//go:build !musl -// +build !musl +//go:build linux && amd64 && !musl +// +build linux,amd64,!musl package zstd /* -#cgo LDFLAGS: ${SRCDIR}/libscroll_zstd_linux_amd64.a +#cgo LDFLAGS: ${SRCDIR}/libencoder_legacy_linux_amd64.a ${SRCDIR}/libencoder_standard_linux_amd64.a */ import "C" diff --git a/encoding/zstd/libscroll_zstd_linux_arm64.go b/encoding/zstd/libscroll_zstd_linux_arm64.go index f3775d2..e577556 100644 --- a/encoding/zstd/libscroll_zstd_linux_arm64.go +++ b/encoding/zstd/libscroll_zstd_linux_arm64.go @@ -1,9 +1,9 @@ -//go:build !musl -// +build !musl +//go:build linux && arm64 && !musl +// +build linux,arm64,!musl package zstd /* -#cgo LDFLAGS: ${SRCDIR}/libscroll_zstd_linux_arm64.a +#cgo LDFLAGS: ${SRCDIR}/libencoder_legacy_linux_arm64.a ${SRCDIR}/libencoder_standard_linux_arm64.a */ import "C" diff 
--git a/encoding/zstd/zstd.go b/encoding/zstd/zstd.go index aab718f..79de46c 100644 --- a/encoding/zstd/zstd.go +++ b/encoding/zstd/zstd.go @@ -2,7 +2,8 @@ package zstd /* #include <stdint.h> -char* compress_scroll_batch_bytes(uint8_t* src, uint64_t src_size, uint8_t* output_buf, uint64_t *output_buf_size); +char* compress_scroll_batch_bytes_legacy(uint8_t* src, uint64_t src_size, uint8_t* output_buf, uint64_t *output_buf_size); +char* compress_scroll_batch_bytes_standard(uint8_t* src, uint64_t src_size, uint8_t* output_buf, uint64_t *output_buf_size); */ import "C" @@ -13,10 +14,11 @@ import ( const compressBufferOverhead = 128 -// CompressScrollBatchBytes compresses the given batch of bytes using zstd compression. +// CompressScrollBatchBytesLegacy compresses the given batch of bytes using zstd compression. +// This function uses the customized scroll-tech/zstd-rs fork version for codec v2-v7. // The output buffer is allocated with an extra compressBufferOverhead bytes to accommodate // potential metadata overhead or error messages from the underlying C function. -func CompressScrollBatchBytes(batchBytes []byte) ([]byte, error) { +func CompressScrollBatchBytesLegacy(batchBytes []byte) ([]byte, error) { if len(batchBytes) == 0 { return nil, fmt.Errorf("input batch is empty") } @@ -25,9 +27,30 @@ func CompressScrollBatchBytes(batchBytes []byte) ([]byte, error) { outbufSize := C.uint64_t(len(batchBytes) + compressBufferOverhead) outbuf := make([]byte, outbufSize) - if err := C.compress_scroll_batch_bytes((*C.uchar)(unsafe.Pointer(&batchBytes[0])), srcSize, + if err := C.compress_scroll_batch_bytes_legacy((*C.uchar)(unsafe.Pointer(&batchBytes[0])), srcSize, (*C.uchar)(unsafe.Pointer(&outbuf[0])), &outbufSize); err != nil { - return nil, fmt.Errorf("failed to compress scroll batch bytes: %s", C.GoString(err)) + return nil, fmt.Errorf("failed to compress scroll batch bytes (legacy): %s", C.GoString(err)) + } + + return outbuf[:int(outbufSize)], nil +} + +// CompressScrollBatchBytesStandard compresses the given batch of bytes using zstd compression. +// This function uses the standard zstd 0.13 experimental version for codec v8 and later. +// The output buffer is allocated with an extra compressBufferOverhead bytes to accommodate +// potential metadata overhead or error messages from the underlying C function. 
+func CompressScrollBatchBytesStandard(batchBytes []byte) ([]byte, error) { + if len(batchBytes) == 0 { + return nil, fmt.Errorf("input batch is empty") + } + + srcSize := C.uint64_t(len(batchBytes)) + outbufSize := C.uint64_t(len(batchBytes) + compressBufferOverhead) + outbuf := make([]byte, outbufSize) + + if err := C.compress_scroll_batch_bytes_standard((*C.uchar)(unsafe.Pointer(&batchBytes[0])), srcSize, + (*C.uchar)(unsafe.Pointer(&outbuf[0])), &outbufSize); err != nil { + return nil, fmt.Errorf("failed to compress scroll batch bytes (standard): %s", C.GoString(err)) } return outbuf[:int(outbufSize)], nil diff --git a/libzstd/.gitignore b/libzstd/encoder-legacy/.gitignore similarity index 100% rename from libzstd/.gitignore rename to libzstd/encoder-legacy/.gitignore diff --git a/libzstd/Cargo.lock b/libzstd/encoder-legacy/Cargo.lock similarity index 94% rename from libzstd/Cargo.lock rename to libzstd/encoder-legacy/Cargo.lock index 46480d5..4c5375b 100644 --- a/libzstd/Cargo.lock +++ b/libzstd/encoder-legacy/Cargo.lock @@ -14,7 +14,7 @@ dependencies = [ ] [[package]] -name = "encoder" +name = "encoder-legacy" version = "0.1.0" dependencies = [ "zstd", @@ -47,13 +47,6 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" -[[package]] -name = "scroll-zstd" -version = "0.1.0" -dependencies = [ - "encoder", -] - [[package]] name = "zstd" version = "0.13.0" diff --git a/libzstd/encoder/Cargo.toml b/libzstd/encoder-legacy/Cargo.toml similarity index 64% rename from libzstd/encoder/Cargo.toml rename to libzstd/encoder-legacy/Cargo.toml index 468f863..781bb8e 100644 --- a/libzstd/encoder/Cargo.toml +++ b/libzstd/encoder-legacy/Cargo.toml @@ -1,10 +1,12 @@ [package] -name = "encoder" +name = "encoder-legacy" version = "0.1.0" edition = "2021" - # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + [lib] +name = "encoder_legacy" +crate-type = ["staticlib"] [dependencies] -zstd = { git = "https://github.com/scroll-tech/zstd-rs", branch = "hack/mul-block", features = ["experimental"]} \ No newline at end of file +zstd = { git = "https://github.com/scroll-tech/zstd-rs", branch = "hack/mul-block", features = ["experimental"] } diff --git a/libzstd/encoder-legacy/Makefile b/libzstd/encoder-legacy/Makefile new file mode 100644 index 0000000..05f43b5 --- /dev/null +++ b/libzstd/encoder-legacy/Makefile @@ -0,0 +1,56 @@ +.PHONY: all clean build install + +# Detect platform +UNAME_S := $(shell uname -s) +UNAME_M := $(shell uname -m) + +ifeq ($(UNAME_S),Linux) + ifeq ($(UNAME_M),x86_64) + PLATFORM := linux_amd64 + else ifeq ($(UNAME_M),aarch64) + PLATFORM := linux_arm64 + endif +else ifeq ($(UNAME_S),Darwin) + ifeq ($(UNAME_M),arm64) + PLATFORM := darwin_arm64 + # Set macOS deployment target to avoid version warnings + export MACOSX_DEPLOYMENT_TARGET := 14.0 + else ifeq ($(UNAME_M),x86_64) + PLATFORM := darwin_amd64 + # Set macOS deployment target to avoid version warnings + export MACOSX_DEPLOYMENT_TARGET := 14.0 + endif +endif + +TARGET_DIR := ../../encoding/zstd + +all: build + +build: + @echo "Building legacy encoder for $(PLATFORM)..." +ifeq ($(UNAME_S),Darwin) + @echo "Setting macOS deployment target to $(MACOSX_DEPLOYMENT_TARGET)" + MACOSX_DEPLOYMENT_TARGET=$(MACOSX_DEPLOYMENT_TARGET) cargo build --release +else + cargo build --release +endif + +install: build + @echo "Installing legacy library to $(TARGET_DIR)..." 
+ cp target/release/libencoder_legacy.a $(TARGET_DIR)/libencoder_legacy_$(PLATFORM).a +ifeq ($(UNAME_S),Darwin) + @echo "Fixing library symbol table..." + ranlib $(TARGET_DIR)/libencoder_legacy_$(PLATFORM).a +endif + @echo "Legacy installation complete!" + +clean: + cargo clean + rm -f $(TARGET_DIR)/libencoder_legacy_$(PLATFORM).a + +info: + @echo "Legacy encoder - Platform: $(PLATFORM)" + @echo "Target directory: $(TARGET_DIR)" +ifeq ($(UNAME_S),Darwin) + @echo "macOS deployment target: $(MACOSX_DEPLOYMENT_TARGET)" +endif diff --git a/libzstd/rust-toolchain b/libzstd/encoder-legacy/rust-toolchain similarity index 100% rename from libzstd/rust-toolchain rename to libzstd/encoder-legacy/rust-toolchain diff --git a/libzstd/encoder-legacy/src/lib.rs b/libzstd/encoder-legacy/src/lib.rs new file mode 100644 index 0000000..ae268e7 --- /dev/null +++ b/libzstd/encoder-legacy/src/lib.rs @@ -0,0 +1,101 @@ +use core::slice; +use std::io::Write; +use std::os::raw::{c_char, c_uchar}; +use std::ptr::null; +use zstd::stream::Encoder; +use zstd::zstd_safe::{CParameter, ParamSwitch}; + +// re-export zstd +pub use zstd; + +// we use offset window no more than = 17 +// TODO: use for multi-block zstd. +#[allow(dead_code)] +pub const CL_WINDOW_LIMIT: usize = 17; + +/// zstd block size target. +pub const N_BLOCK_SIZE_TARGET: u32 = 124 * 1024; + +/// Maximum number of blocks that we can expect in the encoded data. +pub const N_MAX_BLOCKS: u64 = 10; + +/// Zstd encoder configuration +pub fn init_zstd_encoder(target_block_size: u32) -> Encoder<'static, Vec<u8>> { + let mut encoder = Encoder::new(Vec::new(), 0).expect("infallible"); + + // disable compression of literals, i.e. literals will be raw bytes. + encoder + .set_parameter(CParameter::LiteralCompressionMode(ParamSwitch::Disable)) + .expect("infallible"); + // with a hack in zstd we can set window log <= CL_WINDOW_LIMIT with single segment kept + encoder + .set_parameter(CParameter::WindowLog(CL_WINDOW_LIMIT.try_into().unwrap())) + .expect("infallible"); + // set target block size to fit within a single block. + encoder + .set_parameter(CParameter::TargetCBlockSize(target_block_size)) + .expect("infallible"); + // do not include the checksum at the end of the encoded data. + encoder.include_checksum(false).expect("infallible"); + // do not include magic bytes at the start of the frame since we will have a single + // frame. + encoder.include_magicbytes(false).expect("infallible"); + // do not include dictionary id so we have more simple content + encoder.include_dictid(false).expect("infallible"); + // include the content size to know at decode time the expected size of decoded + // data. 
+ encoder.include_contentsize(true).expect("infallible"); + + encoder +} + +/// Helper function to convert error message to C-style string in output buffer +fn out_as_err(err: &str, out: &mut [u8]) -> *const c_char { + let msg = if err.len() + 1 > out.len() { + "compress_scroll_batch_bytes_legacy: not enough output buffer for the error message" + } else { + err + }; + + let cpy_src = unsafe { slice::from_raw_parts(msg.as_ptr(), msg.len()) }; + out[..cpy_src.len()].copy_from_slice(cpy_src); + out[cpy_src.len()] = 0; // build the c-style string + out.as_ptr() as *const c_char +} + +/// Legacy compression function for codec v2-v7 +/// Uses the customized scroll-tech/zstd-rs implementation +#[no_mangle] +pub unsafe extern "C" fn compress_scroll_batch_bytes_legacy( + src: *const c_uchar, + src_size: u64, + output_buf: *mut c_uchar, + output_buf_size: *mut u64, +) -> *const c_char { + let buf_size = *output_buf_size; + let src = unsafe { slice::from_raw_parts(src, src_size as usize) }; + let out = unsafe { slice::from_raw_parts_mut(output_buf, buf_size as usize) }; + + let mut encoder = init_zstd_encoder(N_BLOCK_SIZE_TARGET); + encoder.set_pledged_src_size(Some(src.len() as u64)).expect( + "compress_scroll_batch_bytes_legacy: failed to set pledged src size, should be infallible", + ); + + let ret = encoder.write_all(src); + let ret = ret.and_then(|_| encoder.finish()); + if let Err(e) = ret { + return out_as_err(e.to_string().as_str(), out); + } + + let ret = ret.unwrap(); + if ret.len() > buf_size as usize { + return out_as_err( + "compress_scroll_batch_bytes_legacy: not enough output buffer for compressed data", + out, + ); + } + out[..ret.len()].copy_from_slice(&ret); + *output_buf_size = ret.len() as u64; + + null() +} diff --git a/libzstd/encoder-standard/.gitignore b/libzstd/encoder-standard/.gitignore new file mode 100644 index 0000000..dd635c6 --- /dev/null +++ b/libzstd/encoder-standard/.gitignore @@ -0,0 +1,2 @@ +/target +/_obj diff --git a/libzstd/encoder-standard/Cargo.lock b/libzstd/encoder-standard/Cargo.lock new file mode 100644 index 0000000..1e0124e --- /dev/null +++ b/libzstd/encoder-standard/Cargo.lock @@ -0,0 +1,76 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "cc" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d32a725bc159af97c3e629873bb9f88fb8cf8a4867175f76dc987815ea07c83b" +dependencies = [ + "jobserver", + "libc", + "once_cell", +] + +[[package]] +name = "encoder-standard" +version = "0.1.0" +dependencies = [ + "zstd", +] + +[[package]] +name = "jobserver" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.153" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.15+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/libzstd/Cargo.toml b/libzstd/encoder-standard/Cargo.toml similarity index 56% rename from libzstd/Cargo.toml rename to libzstd/encoder-standard/Cargo.toml index 0298b15..6887315 100644 --- a/libzstd/Cargo.toml +++ b/libzstd/encoder-standard/Cargo.toml @@ -1,21 +1,12 @@ -[workspace] -members = [ - "encoder", -] - [package] -name = "scroll-zstd" +name = "encoder-standard" version = "0.1.0" edition = "2021" +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] +name = "encoder_standard" crate-type = ["staticlib"] - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] -zstd-encoder = { package = "encoder", path = "encoder"} - -[features] -scroll = [ ] +zstd = { version = "0.13", features = ["experimental"] } diff --git a/libzstd/encoder-standard/Makefile b/libzstd/encoder-standard/Makefile new file mode 100644 index 0000000..a25d742 --- /dev/null +++ b/libzstd/encoder-standard/Makefile @@ -0,0 +1,56 @@ +.PHONY: all clean build install + +# Detect platform +UNAME_S := $(shell uname -s) +UNAME_M := $(shell uname -m) + +ifeq ($(UNAME_S),Linux) + ifeq ($(UNAME_M),x86_64) + PLATFORM := linux_amd64 + else ifeq ($(UNAME_M),aarch64) + PLATFORM := linux_arm64 + endif +else ifeq ($(UNAME_S),Darwin) + ifeq ($(UNAME_M),arm64) + PLATFORM := darwin_arm64 + # Set macOS deployment target to avoid version warnings + export MACOSX_DEPLOYMENT_TARGET := 14.0 + else ifeq ($(UNAME_M),x86_64) + PLATFORM := darwin_amd64 + # Set macOS deployment target to avoid version warnings + export MACOSX_DEPLOYMENT_TARGET := 14.0 + 
endif +endif + +TARGET_DIR := ../../encoding/zstd + +all: build + +build: + @echo "Building standard encoder for $(PLATFORM)..." +ifeq ($(UNAME_S),Darwin) + @echo "Setting macOS deployment target to $(MACOSX_DEPLOYMENT_TARGET)" + MACOSX_DEPLOYMENT_TARGET=$(MACOSX_DEPLOYMENT_TARGET) cargo build --release +else + cargo build --release +endif + +install: build + @echo "Installing standard library to $(TARGET_DIR)..." + cp target/release/libencoder_standard.a $(TARGET_DIR)/libencoder_standard_$(PLATFORM).a +ifeq ($(UNAME_S),Darwin) + @echo "Fixing library symbol table..." + ranlib $(TARGET_DIR)/libencoder_standard_$(PLATFORM).a +endif + @echo "Standard installation complete!" + +clean: + cargo clean + rm -f $(TARGET_DIR)/libencoder_standard_$(PLATFORM).a + +info: + @echo "Standard encoder - Platform: $(PLATFORM)" + @echo "Target directory: $(TARGET_DIR)" +ifeq ($(UNAME_S),Darwin) + @echo "macOS deployment target: $(MACOSX_DEPLOYMENT_TARGET)" +endif diff --git a/libzstd/encoder-standard/rust-toolchain b/libzstd/encoder-standard/rust-toolchain new file mode 100644 index 0000000..27c108b --- /dev/null +++ b/libzstd/encoder-standard/rust-toolchain @@ -0,0 +1 @@ +nightly-2023-12-03 diff --git a/libzstd/encoder-standard/src/lib.rs b/libzstd/encoder-standard/src/lib.rs new file mode 100644 index 0000000..4cd41a1 --- /dev/null +++ b/libzstd/encoder-standard/src/lib.rs @@ -0,0 +1,101 @@ +use core::slice; +use std::io::Write; +use std::os::raw::{c_char, c_uchar}; +use std::ptr::null; +use zstd::stream::Encoder; +use zstd::zstd_safe::{CParameter, ParamSwitch}; + +// re-export zstd +pub use zstd; + +// we use offset window no more than = 22 +// TODO: use for multi-block zstd. +#[allow(dead_code)] +pub const CL_WINDOW_LIMIT: usize = 22; + +/// zstd block size target. +pub const N_BLOCK_SIZE_TARGET: u32 = 124 * 1024; + +/// Maximum number of blocks that we can expect in the encoded data. +pub const N_MAX_BLOCKS: u64 = 10; + +/// Zstd encoder configuration +pub fn init_zstd_encoder(target_block_size: u32) -> Encoder<'static, Vec<u8>> { + let mut encoder = Encoder::new(Vec::new(), 0).expect("infallible"); + + // disable compression of literals, i.e. literals will be raw bytes. + encoder + .set_parameter(CParameter::LiteralCompressionMode(ParamSwitch::Disable)) + .expect("infallible"); + // with a hack in zstd we can set window log <= CL_WINDOW_LIMIT with single segment kept + encoder + .set_parameter(CParameter::WindowLog(CL_WINDOW_LIMIT.try_into().unwrap())) + .expect("infallible"); + // set target block size to fit within a single block. + encoder + .set_parameter(CParameter::TargetCBlockSize(target_block_size)) + .expect("infallible"); + // do not include the checksum at the end of the encoded data. + encoder.include_checksum(false).expect("infallible"); + // do not include magic bytes at the start of the frame since we will have a single + // frame. + encoder.include_magicbytes(false).expect("infallible"); + // do not include dictionary id so we have more simple content + encoder.include_dictid(false).expect("infallible"); + // include the content size to know at decode time the expected size of decoded + // data. 
+ encoder.include_contentsize(true).expect("infallible"); + + encoder +} + +/// Helper function to convert error message to C-style string in output buffer +fn out_as_err(err: &str, out: &mut [u8]) -> *const c_char { + let msg = if err.len() + 1 > out.len() { + "compress_scroll_batch_bytes_standard: not enough output buffer for the error message" + } else { + err + }; + + let cpy_src = unsafe { slice::from_raw_parts(msg.as_ptr(), msg.len()) }; + out[..cpy_src.len()].copy_from_slice(cpy_src); + out[cpy_src.len()] = 0; // build the c-style string + out.as_ptr() as *const c_char +} + +/// Standard compression function for codec v8 and later. +/// Uses the standard upstream zstd crate (v0.13, experimental features) +#[no_mangle] +pub unsafe extern "C" fn compress_scroll_batch_bytes_standard( + src: *const c_uchar, + src_size: u64, + output_buf: *mut c_uchar, + output_buf_size: *mut u64, +) -> *const c_char { + let buf_size = *output_buf_size; + let src = unsafe { slice::from_raw_parts(src, src_size as usize) }; + let out = unsafe { slice::from_raw_parts_mut(output_buf, buf_size as usize) }; + + let mut encoder = init_zstd_encoder(N_BLOCK_SIZE_TARGET); + encoder.set_pledged_src_size(Some(src.len() as u64)).expect( + "compress_scroll_batch_bytes_standard: failed to set pledged src size, should be infallible", + ); + + let ret = encoder.write_all(src); + let ret = ret.and_then(|_| encoder.finish()); + if let Err(e) = ret { + return out_as_err(e.to_string().as_str(), out); + } + + let ret = ret.unwrap(); + if ret.len() > buf_size as usize { + return out_as_err( + "compress_scroll_batch_bytes_standard: not enough output buffer for compressed data", + out, + ); + } + out[..ret.len()].copy_from_slice(&ret); + *output_buf_size = ret.len() as u64; + + null() +} diff --git a/libzstd/encoder/src/lib.rs b/libzstd/encoder/src/lib.rs deleted file mode 100644 index 30bb075..0000000 --- a/libzstd/encoder/src/lib.rs +++ /dev/null @@ -1,46 +0,0 @@ -use zstd::stream::Encoder; -use zstd::zstd_safe::{CParameter, ParamSwitch}; - -// re-export zstd -pub use zstd; - -// we use offset window no more than = 17 -// TODO: use for multi-block zstd. -#[allow(dead_code)] -pub const CL_WINDOW_LIMIT: usize = 17; - -/// zstd block size target. -pub const N_BLOCK_SIZE_TARGET: u32 = 124 * 1024; - -/// Maximum number of blocks that we can expect in the encoded data. -pub const N_MAX_BLOCKS: u64 = 10; - -/// Zstd encoder configuration -pub fn init_zstd_encoder(target_block_size: u32) -> Encoder<'static, Vec<u8>> { - let mut encoder = Encoder::new(Vec::new(), 0).expect("infallible"); - - // disable compression of literals, i.e. literals will be raw bytes. - encoder - .set_parameter(CParameter::LiteralCompressionMode(ParamSwitch::Disable)) - .expect("infallible"); - // with a hack in zstd we can set window log <= 17 with single segment kept - encoder - .set_parameter(CParameter::WindowLog(17)) - .expect("infallible"); - // set target block size to fit within a single block. - encoder - .set_parameter(CParameter::TargetCBlockSize(target_block_size)) - .expect("infallible"); - // do not include the checksum at the end of the encoded data. - encoder.include_checksum(false).expect("infallible"); - // do not include magic bytes at the start of the frame since we will have a single - // frame. 
- encoder.include_magicbytes(false).expect("infallible"); - // do not include dictionary id so we have more simple content - encoder.include_dictid(false).expect("infallible"); - // include the content size to know at decode time the expected size of decoded - // data. - encoder.include_contentsize(true).expect("infallible"); - - encoder -} diff --git a/libzstd/src/lib.rs b/libzstd/src/lib.rs deleted file mode 100644 index 34331ab..0000000 --- a/libzstd/src/lib.rs +++ /dev/null @@ -1,54 +0,0 @@ -use core::slice; -use std::io::Write; -use std::os::raw::{c_char, c_uchar}; -use std::ptr::null; -use zstd_encoder::{init_zstd_encoder, N_BLOCK_SIZE_TARGET}; - -fn out_as_err(err: &str, out: &mut [u8]) -> *const c_char { - let msg = if err.len() + 1 > out.len() { - "compress_scroll_batch_bytes: not enough output buffer for the error message" - } else { - err - }; - - let cpy_src = unsafe { slice::from_raw_parts(msg.as_ptr(), msg.len()) }; - out[..cpy_src.len()].copy_from_slice(cpy_src); - out[cpy_src.len()] = 0; // build the c-style string - out.as_ptr() as *const c_char -} - -/// Entry -#[no_mangle] -pub unsafe extern "C" fn compress_scroll_batch_bytes( - src: *const c_uchar, - src_size: u64, - output_buf: *mut c_uchar, - output_buf_size: *mut u64, -) -> *const c_char { - let buf_size = *output_buf_size; - let src = unsafe { slice::from_raw_parts(src, src_size as usize) }; - let out = unsafe { slice::from_raw_parts_mut(output_buf, buf_size as usize) }; - - let mut encoder = init_zstd_encoder(N_BLOCK_SIZE_TARGET); - encoder.set_pledged_src_size(Some(src.len() as u64)).expect( - "compress_scroll_batch_bytes: failed to set pledged src size, should be infallible", - ); - - let ret = encoder.write_all(src); - let ret = ret.and_then(|_| encoder.finish()); - if let Err(e) = ret { - return out_as_err(e.to_string().as_str(), out); - } - - let ret = ret.unwrap(); - if ret.len() > buf_size as usize { - return out_as_err( - "compress_scroll_batch_bytes: not enough output buffer for compressed data", - out, - ); - } - out[..ret.len()].copy_from_slice(&ret); - *output_buf_size = ret.len() as u64; - - null() -}
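
To make the "Go's method receiver behavior" note in codecv8.go concrete: with struct embedding there is no virtual dispatch, so a method promoted from the embedded DACodecV7 always resolves its internal d.checkCompressedDataCompatibility() call against DACodecV7, even when the value is a DACodecV8. A self-contained toy sketch (type and method names here are illustrative only, not from the codebase):

package main

import "fmt"

type v7 struct{}

func (c *v7) compress() string { return "legacy zstd" }

// constructBlob is promoted to v8 via embedding; its receiver is the embedded
// v7 value, so the c.compress() call below always picks v7.compress, even when
// constructBlob is invoked through a v8 value.
func (c *v7) constructBlob() string { return "blob built with " + c.compress() }

type v8 struct{ v7 }

func (c *v8) compress() string { return "standard zstd" }

func main() {
	codec := &v8{}
	fmt.Println(codec.compress())      // "standard zstd": direct calls see the override
	fmt.Println(codec.constructBlob()) // "blob built with legacy zstd": the promoted method does not
}

This is why DACodecV8 re-implements constructBlob, NewDABatch, CheckBatchCompressedDataCompatibility, and the estimate methods above rather than overriding only CompressScrollBatchBytes.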