Fix download_sources for DAPT tutorial (#569)

sarahyurick · web-flow · commit e1abd744093b · 2025-02-25T12:50:08.000-08:00
Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
diff --git a/tutorials/dapt-curation/code/docbuilder.py b/tutorials/dapt-curation/code/docbuilder.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
 import gzip
 import os
 import re
-from typing import Set, Tuple
 from zipfile import ZipFile, ZipInfo
 
 import arxiv as arxiv
@@ -33,7 +32,7 @@
 
 class WikitxtDownloader(DocumentDownloader):
     """
-    A class for downloading data from wiki urls.
+    A class for downloading data from wiki URLs.
     """
 
     def __init__(self, download_dir: str):
@@ -130,9 +129,9 @@ def split_meta(example):
 
 
 class WikitxtExtractor(DocumentExtractor):
-    def extract(self, content: str) -> Tuple[Set, str]:
+    def extract(self, content: str) -> dict:
         # No metadata for the text, just the content.
-        return {}, content
+        return {"text": content}
 
 
 class GitHubDownloader(DocumentDownloader):
@@ -338,9 +337,9 @@ def iterate(self, file_path: str):
 
 
 class GitHubExtractor(DocumentExtractor):
-    def extract(self, content: str):
+    def extract(self, content: str) -> dict:
         # Just return the content.
-        return {}, content
+        return {"text": content}
 
 
 class ArxivDownloader(DocumentDownloader):
@@ -470,6 +469,6 @@ def iterate(self, file_path: str):
 
 
 class ArxivExtractor(DocumentExtractor):
-    def extract(self, content: str):
+    def extract(self, content: str) -> dict:
         # Just return the content.
-        return {}, content
+        return {"text": content}
diff --git a/tutorials/dapt-curation/code/downloaders.py b/tutorials/dapt-curation/code/downloaders.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -78,6 +78,7 @@ def download_wikipedia_sources(
         "line_count": int,
         "size_in_bytes": int,
         "path": str,
+        "file_name": str,
     }
 
     downloader = WikitxtDownloader(output_dir)
@@ -92,8 +93,8 @@ def download_wikipedia_sources(
         extractor=extractor,
         output_format=output_format,
     )
-    # Force the computation of the dataset
-    dataset.persist()
+
+    dataset.to_json(output_dir, write_to_filename="file_name")
     return output_dir
 
 
@@ -151,6 +152,7 @@ def download_github_sources(
         "line_count": int,
         "size_in_bytes": int,
         "path": str,
+        "file_name": str,
     }
 
     dataset = download_and_extract(
@@ -164,8 +166,8 @@ def download_github_sources(
         output_format=output_format,
         keep_raw_download=True,
     )
-    # Force the computation of the dataset
-    dataset.persist()
+
+    dataset.to_json(output_jsonl_dir, write_to_filename="file_name")
     return output_jsonl_dir
 
 
@@ -225,6 +227,7 @@ def download_pdf_sources(
         "line_count": int,
         "size_in_bytes": int,
         "path": str,
+        "file_name": str,
     }
 
     dataset = download_and_extract(
@@ -238,6 +241,6 @@ def download_pdf_sources(
         output_format=output_format,
         keep_raw_download=True,
     )
-    # Force the computation of the dataset
-    dataset.persist()
+
+    dataset.to_json(output_jsonl_dir, write_to_filename="file_name")
     return output_jsonl_dir