-
Notifications
You must be signed in to change notification settings - Fork 15
feat: get piece leaf count #249
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
cf89ea3
96d6a5a
4c5e0a6
2018a6e
aaccbe1
5972649
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
|
||
import type { LegacyPieceLink as LegacyPieceCIDType, PieceLink as PieceCIDType } from '@web3-storage/data-segment' | ||
import * as Hasher from '@web3-storage/data-segment/multihash' | ||
import { fromLink } from '@web3-storage/data-segment/piece' | ||
import { CID } from 'multiformats/cid' | ||
import * as Raw from 'multiformats/codecs/raw' | ||
import * as Digest from 'multiformats/hashes/digest' | ||
|
@@ -164,6 +165,47 @@ export function calculate(data: Uint8Array): PieceCID { | |
return Link.create(Raw.code, digest) | ||
} | ||
|
||
/** | ||
* Extract leaf count from a PieceCID v2 | ||
* @param pieceCid - The PieceCID to extract leaf count from | ||
* @returns The leaf count (number of leaves in the merkle tree) or null if invalid | ||
*/ | ||
export function getLeafCount(pieceCid: PieceCID | CID | string): number | null { | ||
const validPieceCid = asPieceCID(pieceCid) | ||
if (!validPieceCid) { | ||
return null | ||
} | ||
|
||
try { | ||
const piece = fromLink(validPieceCid) | ||
// The leaf count is 2^height | ||
return 2 ** piece.height | ||
} catch { | ||
return null | ||
} | ||
} | ||
|
||
/** | ||
* Extract raw size from a PieceCID v2 | ||
* @param pieceCid - The PieceCID to extract raw size from | ||
* @returns The raw size in bytes or null if invalid | ||
*/ | ||
export function getRawSize(pieceCid: PieceCID | CID | string): number | null { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not quite right, because the height only gives us the padded size; we need to unpad using the padding that's also encoded in the PieceCID. See #283 |
||
const validPieceCid = asPieceCID(pieceCid) | ||
if (!validPieceCid) { | ||
return null | ||
} | ||
|
||
try { | ||
const piece = fromLink(validPieceCid) | ||
// Raw size is leaf count * 32 bytes | ||
const leafCount = 2 ** piece.height | ||
return leafCount * 32 | ||
} catch { | ||
return null | ||
} | ||
} | ||
|
||
/** | ||
* Create a TransformStream that calculates PieceCID while streaming data through it | ||
* This allows calculating PieceCID without buffering the entire data in memory | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,14 +22,18 @@ | |
* ``` | ||
*/ | ||
|
||
import type { ethers } from 'ethers' | ||
import { ethers } from 'ethers' | ||
import { CID } from 'multiformats/cid' | ||
import type { PaymentsService } from '../payments/index.ts' | ||
import { PDPAuthHelper, PDPServer } from '../pdp/index.ts' | ||
import { asPieceCID } from '../piece/index.ts' | ||
import { PDPVerifier } from '../pdp/verifier.ts' | ||
import { asPieceCID, getLeafCount, getRawSize } from '../piece/index.ts' | ||
import { SPRegistryService } from '../sp-registry/index.ts' | ||
import type { ProviderInfo } from '../sp-registry/types.ts' | ||
import type { Synapse } from '../synapse.ts' | ||
import type { | ||
DataSetPieceData, | ||
DataSetPieceDataWithLeafCount, | ||
DownloadOptions, | ||
EnhancedDataSetInfo, | ||
MetadataEntry, | ||
|
@@ -1260,6 +1264,116 @@ export class StorageContext { | |
return dataSetData.pieces.map((piece) => piece.pieceCid) | ||
} | ||
|
||
async getPiecesWithDetails(options?: { | ||
batchSize?: number | ||
signal?: AbortSignal | ||
}): Promise<DataSetPieceDataWithLeafCount[]> { | ||
const pieces: DataSetPieceDataWithLeafCount[] = [] | ||
|
||
for await (const piece of this.getAllActivePiecesGenerator(options)) { | ||
// TODO: should we call the contract for leaf count? i.e. pdpVerifier.getPieceLeafCount(this._dataSetId, piece.pieceId) | ||
const leafCount = getLeafCount(piece.pieceCid) ?? 0 | ||
Comment on lines
+1274
to
+1275
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's ditch this. |
||
// TODO: is there a better way to get the raw size? | ||
const rawSize = getRawSize(piece.pieceCid) ?? 0 | ||
Comment on lines
+1274
to
+1277
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is the only place where we're setting rawSize and leafCount now. Should we be calling the contract instead of these helper methods? |
||
pieces.push({ | ||
pieceId: piece.pieceId, | ||
pieceCid: piece.pieceCid, | ||
rawSize, | ||
leafCount, | ||
subPieceCid: piece.pieceCid, | ||
subPieceOffset: 0, // TODO: figure out how to get the sub piece offset | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How do I populate this value accurately? |
||
} satisfies DataSetPieceDataWithLeafCount) | ||
} | ||
|
||
return pieces | ||
} | ||
|
||
/** | ||
* Get all active pieces for this data set directly from the PDPVerifier contract. | ||
* This bypasses Curio and gets the authoritative piece list from the blockchain. | ||
* @param options - Optional configuration object | ||
* @param options.batchSize - The batch size for each pagination call (default: 100) | ||
* @param options.signal - Optional AbortSignal to cancel the operation | ||
* @returns Array of all active pieces with their details including PieceCID | ||
*/ | ||
async getAllActivePieces(options?: { batchSize?: number; signal?: AbortSignal }): Promise<Array<DataSetPieceData>> { | ||
const allPieces: Array<DataSetPieceData> = [] | ||
|
||
for await (const piece of this.getAllActivePiecesGenerator(options)) { | ||
allPieces.push({ | ||
pieceId: piece.pieceId, | ||
pieceCid: piece.pieceCid, | ||
subPieceCid: piece.pieceCid, | ||
subPieceOffset: 0, // TODO: figure out how to get the sub piece offset | ||
} satisfies DataSetPieceData) | ||
} | ||
|
||
return allPieces | ||
} | ||
|
||
/** | ||
* Get all active pieces for this data set as an async generator. | ||
* This provides lazy evaluation and better memory efficiency for large data sets. | ||
* Gets data directly from PDPVerifier contract (source of truth) rather than Curio. | ||
* @param options - Optional configuration object | ||
* @param options.batchSize - The batch size for each pagination call (default: 100) | ||
* @param options.signal - Optional AbortSignal to cancel the operation | ||
* @yields Individual pieces with their details including PieceCID | ||
*/ | ||
async *getAllActivePiecesGenerator(options?: { | ||
batchSize?: number | ||
signal?: AbortSignal | ||
}): AsyncGenerator<DataSetPieceData> { | ||
const pdpVerifierAddress = this._warmStorageService.getPDPVerifierAddress() | ||
const pdpVerifier = new PDPVerifier(this._synapse.getProvider(), pdpVerifierAddress) | ||
|
||
const batchSize = options?.batchSize ?? 100 | ||
const signal = options?.signal | ||
let offset = 0 | ||
let hasMore = true | ||
|
||
while (hasMore) { | ||
if (signal?.aborted) { | ||
throw createError('StorageContext', 'getAllActivePiecesGenerator', 'Operation aborted') | ||
} | ||
|
||
const result = await pdpVerifier.getActivePieces(this._dataSetId, { offset, limit: batchSize, signal }) | ||
|
||
// Yield pieces one by one for lazy evaluation | ||
for (let i = 0; i < result.pieces.length; i++) { | ||
if (signal?.aborted) { | ||
throw createError('StorageContext', 'getAllActivePiecesGenerator', 'Operation aborted') | ||
} | ||
|
||
// Parse the piece data as a PieceCID | ||
// The contract stores the full PieceCID multihash digest (including height and padding) | ||
// The data comes as a hex string from ethers, we need to decode it as bytes then as a CID | ||
const pieceDataHex = result.pieces[i].data | ||
const pieceDataBytes = ethers.getBytes(pieceDataHex) | ||
|
||
const cid = CID.decode(pieceDataBytes) | ||
const pieceCid = asPieceCID(cid) | ||
if (!pieceCid) { | ||
throw createError( | ||
'StorageContext', | ||
'getAllActivePiecesGenerator', | ||
`Invalid PieceCID returned from contract for piece ${result.pieceIds[i]}` | ||
) | ||
} | ||
|
||
yield { | ||
pieceId: result.pieceIds[i], | ||
pieceCid, | ||
subPieceCid: pieceCid, | ||
subPieceOffset: 0, // TODO: figure out how to get the sub piece offset | ||
} satisfies DataSetPieceData | ||
Comment on lines
+1348
to
+1369
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: a core contributor should really eyeball this to make sure I'm doing things properly. |
||
} | ||
|
||
hasMore = result.hasMore | ||
offset += batchSize | ||
} | ||
} | ||
|
||
/** | ||
* Check if a piece exists on this service provider. | ||
* @param pieceCid - The PieceCID (piece CID) to check | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,15 @@ import type { API } from '@web3-storage/data-segment' | |
import { Size, toLink } from '@web3-storage/data-segment/piece' | ||
import { assert } from 'chai' | ||
import { CID } from 'multiformats/cid' | ||
import { asLegacyPieceCID, asPieceCID, calculate, createPieceCIDStream, type PieceCID } from '../piece/index.ts' | ||
import { | ||
asLegacyPieceCID, | ||
asPieceCID, | ||
calculate, | ||
createPieceCIDStream, | ||
getLeafCount, | ||
getRawSize, | ||
type PieceCID, | ||
} from '../piece/index.ts' | ||
|
||
// https://github.com/filecoin-project/go-fil-commp-hashhash/blob/master/testdata/zero.txt | ||
const zeroPieceCidFixture = ` | ||
|
@@ -246,4 +254,57 @@ describe('PieceCID utilities', () => { | |
// more complex async coordination, so we keep this test simple | ||
}) | ||
}) | ||
|
||
describe('getLeafCount', () => { | ||
zeroPieceCidFixture.forEach(([size, , v1]) => { | ||
it(`should extract correct leaf count for size ${size}`, () => { | ||
const v2 = toPieceCID(BigInt(size), v1) | ||
const leafCount = getLeafCount(v2) | ||
|
||
// Expected leaf count is 2^height where height is calculated from size | ||
const expectedHeight = Size.Unpadded.toHeight(BigInt(size)) | ||
const expectedLeafCount = 2 ** expectedHeight | ||
|
||
assert.isNotNull(leafCount) | ||
assert.strictEqual(leafCount, expectedLeafCount) | ||
Comment on lines
+264
to
+269
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this accurate? |
||
}) | ||
}) | ||
|
||
it('should return null for invalid PieceCID', () => { | ||
const result = getLeafCount(invalidCidString) | ||
assert.isNull(result) | ||
}) | ||
|
||
it('should return null for null input', () => { | ||
const result = getLeafCount(null as any) | ||
assert.isNull(result) | ||
}) | ||
}) | ||
|
||
describe('getRawSize', () => { | ||
zeroPieceCidFixture.forEach(([size, , v1]) => { | ||
it(`should extract correct raw size for size ${size}`, () => { | ||
const v2 = toPieceCID(BigInt(size), v1) | ||
const rawSize = getRawSize(v2) | ||
|
||
// Expected raw size is leaf count * 32 | ||
const expectedHeight = Size.Unpadded.toHeight(BigInt(size)) | ||
const expectedLeafCount = 2 ** expectedHeight | ||
const expectedRawSize = expectedLeafCount * 32 | ||
Comment on lines
+290
to
+293
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this accurate? |
||
|
||
assert.isNotNull(rawSize) | ||
assert.strictEqual(rawSize, expectedRawSize) | ||
}) | ||
}) | ||
|
||
it('should return null for invalid PieceCID', () => { | ||
const result = getRawSize(invalidCidString) | ||
assert.isNull(result) | ||
}) | ||
|
||
it('should return null for null input', () => { | ||
const result = getRawSize(null as any) | ||
assert.isNull(result) | ||
}) | ||
}) | ||
}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Callout here: I'm not sure whether the leafCount calculation in piece.ts is sufficient.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is ok, but I don't think it's all that useful here; let's remove it for now and ignore leaf counts -- they mostly shouldn't be a concern to the user other than their fairly close relationship to size