Skip to content

Commit e1abd74

Browse files
authored
Fix download_sources for DAPT tutorial (#569)
Signed-off-by: Sarah Yurick <[email protected]>
1 parent 0158d93 commit e1abd74

File tree

2 files changed

+18
-16
lines changed

2 files changed

+18
-16
lines changed

tutorials/dapt-curation/code/docbuilder.py

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
1515
import gzip
1616
import os
1717
import re
18-
from typing import Set, Tuple
1918
from zipfile import ZipFile, ZipInfo
2019

2120
import arxiv as arxiv
@@ -33,7 +32,7 @@
3332

3433
class WikitxtDownloader(DocumentDownloader):
3534
"""
36-
A class for downloading data from wiki urls.
35+
A class for downloading data from wiki URLs.
3736
"""
3837

3938
def __init__(self, download_dir: str):
@@ -130,9 +129,9 @@ def split_meta(example):
130129

131130

132131
class WikitxtExtractor(DocumentExtractor):
133-
def extract(self, content: str) -> Tuple[Set, str]:
132+
def extract(self, content: str) -> dict:
134133
# No metadata for the text, just the content.
135-
return {}, content
134+
return {"text": content}
136135

137136

138137
class GitHubDownloader(DocumentDownloader):
@@ -338,9 +337,9 @@ def iterate(self, file_path: str):
338337

339338

340339
class GitHubExtractor(DocumentExtractor):
341-
def extract(self, content: str):
340+
def extract(self, content: str) -> dict:
342341
# Just return the content.
343-
return {}, content
342+
return {"text": content}
344343

345344

346345
class ArxivDownloader(DocumentDownloader):
@@ -470,6 +469,6 @@ def iterate(self, file_path: str):
470469

471470

472471
class ArxivExtractor(DocumentExtractor):
473-
def extract(self, content: str):
472+
def extract(self, content: str) -> dict:
474473
# Just return the content.
475-
return {}, content
474+
return {"text": content}

tutorials/dapt-curation/code/downloaders.py

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -78,6 +78,7 @@ def download_wikipedia_sources(
7878
"line_count": int,
7979
"size_in_bytes": int,
8080
"path": str,
81+
"file_name": str,
8182
}
8283

8384
downloader = WikitxtDownloader(output_dir)
@@ -92,8 +93,8 @@ def download_wikipedia_sources(
9293
extractor=extractor,
9394
output_format=output_format,
9495
)
95-
# Force the computation of the dataset
96-
dataset.persist()
96+
97+
dataset.to_json(output_dir, write_to_filename="file_name")
9798
return output_dir
9899

99100

@@ -151,6 +152,7 @@ def download_github_sources(
151152
"line_count": int,
152153
"size_in_bytes": int,
153154
"path": str,
155+
"file_name": str,
154156
}
155157

156158
dataset = download_and_extract(
@@ -164,8 +166,8 @@ def download_github_sources(
164166
output_format=output_format,
165167
keep_raw_download=True,
166168
)
167-
# Force the computation of the dataset
168-
dataset.persist()
169+
170+
dataset.to_json(output_jsonl_dir, write_to_filename="file_name")
169171
return output_jsonl_dir
170172

171173

@@ -225,6 +227,7 @@ def download_pdf_sources(
225227
"line_count": int,
226228
"size_in_bytes": int,
227229
"path": str,
230+
"file_name": str,
228231
}
229232

230233
dataset = download_and_extract(
@@ -238,6 +241,6 @@ def download_pdf_sources(
238241
output_format=output_format,
239242
keep_raw_download=True,
240243
)
241-
# Force the computation of the dataset
242-
dataset.persist()
244+
245+
dataset.to_json(output_jsonl_dir, write_to_filename="file_name")
243246
return output_jsonl_dir

0 commit comments

Comments
 (0)