-
Notifications
You must be signed in to change notification settings - Fork 233
Fix cloud compatibility issues in Text Components by using fsspec and posixpath #1037
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
14624db
5680195
dc2acfa
7dc6111
903688e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,11 +13,13 @@ | |
| # limitations under the License. | ||
|
|
||
| import os | ||
| import posixpath | ||
| import subprocess | ||
| from abc import ABC, abstractmethod | ||
| from dataclasses import dataclass | ||
| from typing import Any | ||
|
|
||
| import fsspec | ||
| from loguru import logger | ||
|
|
||
| from nemo_curator.stages.base import ProcessingStage | ||
|
|
@@ -37,7 +39,9 @@ | |
| """ | ||
| self._download_dir = download_dir | ||
| self._verbose = verbose | ||
| os.makedirs(download_dir, exist_ok=True) | ||
| # Use fsspec for cloud-compatible directory creation | ||
| fs, _ = fsspec.core.url_to_fs(download_dir) | ||
| fs.makedirs(download_dir, exist_ok=True) | ||
|
|
||
| def _check_s5cmd_installed(self) -> bool: | ||
| """Check if s5cmd is installed.""" | ||
|
|
@@ -87,14 +91,24 @@ | |
| """ | ||
| # Generate output filename | ||
| output_name = self._get_output_filename(url) | ||
| output_file = os.path.join(self._download_dir, output_name) | ||
| output_file = posixpath.join(self._download_dir, output_name) | ||
| temp_file = output_file + ".tmp" | ||
|
|
||
| # Use fsspec for cloud-compatible file operations | ||
| fs, _ = fsspec.core.url_to_fs(output_file) | ||
|
|
||
|
Check failure on line 99 in nemo_curator/stages/text/download/base/download.py
|
||
| # If final file exists and is non-empty, assume it's complete | ||
| if os.path.exists(output_file) and os.path.getsize(output_file) > 0: | ||
| if self._verbose: | ||
| logger.info(f"File: {output_file} exists. Not downloading") | ||
| return output_file | ||
| if fs.exists(output_file): | ||
| try: | ||
| file_info = fs.info(output_file) | ||
| file_size = file_info.get("size", 0) | ||
| if file_size > 0: | ||
| if self._verbose: | ||
| logger.info(f"File: {output_file} exists. Not downloading") | ||
| return output_file | ||
| except Exception: | ||
|
Check failure on line 109 in nemo_curator/stages/text/download/base/download.py
|
||
| # If we can't get file info, proceed with download | ||
| pass | ||
|
Check failure on line 111 in nemo_curator/stages/text/download/base/download.py
|
||
|
|
||
| # Download to temporary file | ||
| success, error_message = self._download_to_path(url, temp_file) | ||
|
|
@@ -103,8 +117,13 @@ | |
| # Download successful, atomically move temp file to final location | ||
| os.rename(temp_file, output_file) | ||
| if self._verbose: | ||
| file_size = os.path.getsize(output_file) | ||
| logger.info(f"Successfully downloaded to {output_file} ({file_size} bytes)") | ||
| try: | ||
| fs, _ = fsspec.core.url_to_fs(output_file) | ||
| file_info = fs.info(output_file) | ||
| file_size = file_info.get("size", 0) | ||
| logger.info(f"Successfully downloaded to {output_file} ({file_size} bytes)") | ||
| except Exception: | ||
|
Check failure on line 125 in nemo_curator/stages/text/download/base/download.py
|
||
|
||
| logger.info(f"Successfully downloaded to {output_file}") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But it wasn't successful in this case?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @copilot , please fix
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch! Fixed the misleading message - now it clearly logs "Successfully downloaded" even when we can't get the file size, and adds a debug message explaining why the size couldn't be retrieved. Fixed in commit 903688e.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed the misleading message in commit 903688e - now it clearly logs "Successfully downloaded" even when we can't get the file size, and adds a debug message explaining why the size couldn't be retrieved. |
||
| return output_file | ||
| else: | ||
| # Download failed | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need this try/except?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@copilot , please fix
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're right - I simplified the exception handling. Removed the unnecessary try/except for the file existence check (lines 102-111) since if
fs.exists()returns True,fs.info()should generally work. Improved the remaining try/catch for logging with better error messaging. Fixed in commit 903688e.