
Commit e9bc8f3

Author: Bob Strahan (committed)

Refactor FCC dataset deployer to use lightweight HuggingFace Hub API instead of datasets library

1 parent 50f6824 commit e9bc8f3

4 files changed (+47, -26 lines changed)

CHANGELOG.md

Lines changed: 4 additions & 3 deletions
@@ -9,14 +9,15 @@ SPDX-License-Identifier: MIT-0
 - **RealKIE-FCC-Verified Dataset Auto-Deployment for Test Studio**
   - Added fully automatic deployment of the public RealKIE-FCC-Verified dataset from HuggingFace during stack deployment with zero manual steps
-  - **Direct PDF Download**: Downloads original PDF files from HuggingFace repository's `/pdfs` directory using `hf_hub_download` API
+  - **Lightweight Implementation**: Uses `hf_hub_download()` API for both parquet metadata and PDF files, with `pyarrow` for efficient parquet reading - total package size ~20MB (well under 250MB Lambda limit)
+  - **Direct File Download**: Downloads original PDF files from HuggingFace repository's `/pdfs` directory and parquet metadata from `/data` directory using unified `hf_hub_download()` approach
   - **Complete Dataset Deployment**: 75 FCC invoice documents (PDFs + ground truth) automatically deployed to TestSetBucket and registered in Test Studio
   - **Zero User Effort**: Test set immediately available in Test Studio UI post-deployment - no manual downloads, no local files, no additional scripts
   - **Version Control**: Dataset version pinned to CloudFormation CustomResource property enabling controlled updates when new dataset versions are released
   - **Efficient Updates**: Skips re-download on stack updates unless dataset version changes, preventing unnecessary deployment time
-  - **Ground Truth Included**: Complete baseline data extracted from HuggingFace `json_response` field in accelerator format (Agency, Advertiser, GrossTotal, PaymentTerms, AgencyCommission, NetAmountDue, LineItems)
+  - **Ground Truth Included**: Complete baseline data extracted from HuggingFace parquet `json_response` field in accelerator format (Agency, Advertiser, GrossTotal, PaymentTerms, AgencyCommission, NetAmountDue, LineItems)
   - **S3 Structure**: Organized in TestSetBucket with proper `input/{doc_id}.pdf` and `baseline/{doc_id}.pdf/sections/1/result.json` structure
-  - **Lambda Implementation**: Custom Resource Lambda function (900s timeout, 2GB memory) with HuggingFace datasets library and Hub API for direct PDF access
+  - **Lambda Implementation**: Custom Resource Lambda function (900s timeout, 2GB memory) with minimal dependencies (huggingface-hub, pyarrow, boto3, crhelper)
   - **Single Data Source**: Everything sourced from the public HuggingFace dataset - fully reproducible and deployable anywhere
   - **Use Cases**: Immediate testing capability after deployment, benchmark dataset for evaluating extraction performance, training and demonstration purposes
   - **Configuration**: Controlled by `FccDatasetDeployment` CustomResource with configurable `DatasetVersion` property (default: "1.0")
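For illustration, here is a minimal sketch of the per-document S3 layout and baseline record format described in this changelog entry. The prefix and all ground-truth values are placeholders; only the key pattern and the `inference_result` wrapper follow the commit.

```python
# Illustrative only: shows the TestSetBucket key layout and baseline format
# described in the changelog. The prefix and field values are placeholders.
import json

DATASET_PREFIX = "fcc-realkie/"          # assumed prefix, not from the commit
document_id = "example_invoice.pdf"      # placeholder document id (includes .pdf)

pdf_key = f"{DATASET_PREFIX}input/{document_id}"
result_key = f"{DATASET_PREFIX}baseline/{document_id}/sections/1/result.json"

# Ground truth comes from the dataset's json_response field, wrapped for the accelerator
result_json = {
    "inference_result": {
        "Agency": "Example Agency",          # placeholder values
        "Advertiser": "Example Advertiser",
        "GrossTotal": "1000.00",
        "PaymentTerms": "Net 30",
        "AgencyCommission": "150.00",
        "NetAmountDue": "850.00",
        "LineItems": [],
    }
}

print(pdf_key)
print(result_key)
print(json.dumps(result_json, indent=2))
```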

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.4.7
+0.4.7-wip1

src/lambda/fcc_dataset_deployer/index.py

Lines changed: 37 additions & 19 deletions
@@ -11,9 +11,13 @@
 from typing import Dict, Any
 import cfnresponse
 
-# HuggingFace datasets library - will fail fast if not available
-from datasets import load_dataset
+# Set HuggingFace cache to /tmp (Lambda's writable directory)
+os.environ['HF_HOME'] = '/tmp/huggingface'
+os.environ['HUGGINGFACE_HUB_CACHE'] = '/tmp/huggingface/hub'
+
+# Lightweight HuggingFace access
 from huggingface_hub import hf_hub_download
+import pyarrow.parquet as pq
 
 # Configure logging
 logger = logging.getLogger()
@@ -43,8 +47,7 @@ def handler(event, context):
     request_type = event['RequestType']
 
     if request_type == 'Delete':
-        # On stack deletion, we optionally clean up
-        # For now, we'll leave the data in place
+        # On stack deletion, we leave the data in place
         logger.info("Delete request - keeping dataset in place")
         cfnresponse.send(event, context, cfnresponse.SUCCESS, {})
         return
@@ -118,40 +121,56 @@ def check_existing_version(version: str) -> bool:
 def deploy_dataset(version: str, description: str) -> Dict[str, Any]:
     """
     Deploy the dataset by downloading PDFs and ground truth from HuggingFace
-    and uploading to S3.
+    using lightweight hf_hub_download and pyarrow.
     """
     try:
+        # Ensure cache directory exists in /tmp (Lambda's writable directory)
+        cache_dir = '/tmp/huggingface/hub'
+        os.makedirs(cache_dir, exist_ok=True)
+        logger.info(f"Using cache directory: {cache_dir}")
+
        logger.info(f"Downloading dataset from HuggingFace: amazon-agi/RealKIE-FCC-Verified")
 
-        # Download the dataset metadata (for ground truth)
-        dataset = load_dataset("amazon-agi/RealKIE-FCC-Verified", split='test')
+        # Download the parquet file with metadata using hf_hub_download
+        parquet_path = hf_hub_download(
+            repo_id="amazon-agi/RealKIE-FCC-Verified",
+            filename="data/test-00000-of-00001.parquet",
+            repo_type="dataset",
+            cache_dir=cache_dir
+        )
+
+        logger.info(f"Downloaded parquet metadata file")
+
+        # Read parquet file with pyarrow
+        table = pq.read_table(parquet_path)
+        data_dict = table.to_pydict()
 
-        logger.info(f"Dataset loaded with {len(dataset)} documents")
+        num_documents = len(data_dict['id'])
+        logger.info(f"Loaded {num_documents} documents from parquet")
 
         # Process and upload each document
         file_count = 0
         skipped_count = 0
 
-        for idx, item in enumerate(dataset):
+        for idx in range(num_documents):
             try:
-                document_id = item.get('id', f'doc_{idx}')
+                document_id = data_dict['id'][idx]
+                json_response = data_dict['json_response'][idx]
 
-                # Get ground truth from json_response field
-                json_response = item.get('json_response', {})
                 if not json_response:
                     logger.warning(f"Skipping {document_id}: no json_response")
                     skipped_count += 1
                     continue
 
                 logger.info(f"Processing {document_id}")
 
-                # Download PDF file from HuggingFace repository
-                # PDFs are stored in the /pdfs directory of the dataset repo
+                # Download PDF file from HuggingFace repository using hf_hub_download
                 try:
                     pdf_path = hf_hub_download(
                         repo_id="amazon-agi/RealKIE-FCC-Verified",
                         filename=f"pdfs/{document_id}",
-                        repo_type="dataset"
+                        repo_type="dataset",
+                        cache_dir=cache_dir
                     )
 
                     # Read the downloaded PDF
@@ -174,7 +193,7 @@ def deploy_dataset(version: str, description: str) -> Dict[str, Any]:
                     skipped_count += 1
                     continue
 
-                # Upload ground truth baseline (already in correct format!)
+                # Upload ground truth baseline (wrap in inference_result)
                 result_json = {"inference_result": json_response}
                 result_key = f'{DATASET_PREFIX}baseline/{document_id}/sections/1/result.json'
                 s3_client.put_object(
@@ -187,7 +206,7 @@ def deploy_dataset(version: str, description: str) -> Dict[str, Any]:
                 file_count += 1
 
                 if file_count % 10 == 0:
-                    logger.info(f"Processed {file_count}/{len(dataset)} documents...")
+                    logger.info(f"Processed {file_count}/{num_documents} documents...")
 
             except Exception as e:
                 logger.error(f"Error processing document {idx} ({document_id}): {e}")
@@ -233,8 +252,7 @@ def create_testset_record(version: str, description: str, file_count: int):
         'datasetVersion': version,
         'createdAt': timestamp,
         'updatedAt': timestamp,
-        'source': 'huggingface:amazon-agi/RealKIE-FCC-Verified',
-        'ExpiresAfter': int((datetime.utcnow().timestamp() + (365 * 24 * 60 * 60))) # 1 year TTL
+        'source': 'huggingface:amazon-agi/RealKIE-FCC-Verified'
     }
 
     table.put_item(Item=item)
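As a side note, the same download path can be exercised outside the Lambda, which makes the refactor easy to verify locally. A minimal sketch, assuming network access and the column names (`id`, `json_response`) shown in the diff above:

```python
# Local sketch of the lightweight access pattern used by the deployer:
# hf_hub_download() fetches individual files from the dataset repo and
# pyarrow reads the parquet metadata (no `datasets` library needed).
# Requires: pip install huggingface-hub pyarrow
from huggingface_hub import hf_hub_download
import pyarrow.parquet as pq

parquet_path = hf_hub_download(
    repo_id="amazon-agi/RealKIE-FCC-Verified",
    filename="data/test-00000-of-00001.parquet",
    repo_type="dataset",
)

table = pq.read_table(parquet_path)
data = table.to_pydict()  # column-oriented dict, e.g. {'id': [...], 'json_response': [...]}

print(f"{len(data['id'])} documents in parquet metadata")
for doc_id, gt in zip(data["id"][:3], data["json_response"][:3]):
    print(doc_id, "ok" if gt else "missing json_response")
```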
src/lambda/fcc_dataset_deployer/requirements.txt

Lines changed: 5 additions & 3 deletions

@@ -1,9 +1,11 @@
-# HuggingFace datasets library for downloading the RealKIE-FCC-Verified dataset
-datasets>=2.14.0
+# Lightweight HuggingFace file download (no heavy datasets library)
 huggingface-hub>=0.20.0
 
+# For reading parquet metadata files efficiently
+pyarrow>=20.0.0
+
 # AWS SDK (boto3 is already available in Lambda runtime, but specified for local testing)
 boto3>=1.34.0
 
 # For CloudFormation custom resource responses
-crhelper>=2.0.11
+cfnresponse
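A quick, illustrative way to confirm the trimmed dependency set resolves in a local environment (distribution names assumed to match the pins above):

```python
# Illustrative check that the minimal dependencies from requirements.txt are
# installed; prints the resolved version of each distribution.
from importlib import metadata

for pkg in ("huggingface-hub", "pyarrow", "boto3", "cfnresponse"):
    try:
        print(f"{pkg}=={metadata.version(pkg)}")
    except metadata.PackageNotFoundError:
        print(f"{pkg}: not installed")
```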
