OpenCSGs · 425zekunwang · Aug 6, 2025 · Aug 18, 2025 · Aug 18, 2025 · Aug 18, 2025
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,4 @@ dist/
 .pypirc
 .vscode/
 .DS_Store
+.codesouler/
diff --git a/build.py b/build.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+Build script for handling version information
+"""
+
+import re
+from pathlib import Path
+
+
+def get_version():
+    """Return the version declared in pyproject.toml, or "0.1.0" if none is found."""
+    pyproject_path = Path("pyproject.toml")
+    if pyproject_path.exists():
+        content = pyproject_path.read_text(encoding="utf-8")
+        # Anchored at line start so keys like `python_version` are not matched.
+        match = re.search(r'^version\s*=\s*"([^"]+)"', content, re.MULTILINE)
+        if match:
+            return match.group(1)
+    return "0.1.0"  # default version
+
+
+def update_version_in_init():
+    """Rewrite ``__version__`` in pycsghub/__init__.py with the value from get_version()."""
+    init_path = Path("pycsghub/__init__.py")
+    if not init_path.exists():
+        return
+    version = get_version()  # resolve once; reused for the rewrite and the message
+    content = init_path.read_text(encoding="utf-8")
+
+    # A callable replacement inserts the version verbatim (no escape processing).
+    new_content, count = re.subn(
+        r'__version__ = "[^"]*"',
+        lambda _match: f'__version__ = "{version}"',
+        content,
+    )
+    if count:  # only write and report when an assignment was actually replaced
+        init_path.write_text(new_content, encoding="utf-8")
+        print(f"Updated version to {version} in __init__.py")
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    update_version_in_init()
diff --git a/examples/download_dataset.py b/examples/download_dataset.py
@@ -1,4 +1,5 @@
 from pycsghub.snapshot_download import snapshot_download
+
 # token = "your access token"
 token = None
 

diff --git a/examples/download_file.py b/examples/download_file.py
@@ -1,4 +1,5 @@
 from pycsghub.file_download import file_download
+
 # token = "your access token"
 token = None
 
@@ -7,11 +8,11 @@
 repo_id = 'OpenCSG/csg-wukong-1B'
 local_dir = "/Users/hhwang/temp/wukong"
 result = file_download(
-    repo_id, 
-    file_name='README.md', 
-    local_dir=local_dir, 
-    endpoint=endpoint, 
-    token=token, 
+    repo_id,
+    file_name='README.md',
+    local_dir=local_dir,
+    endpoint=endpoint,
+    token=token,
     repo_type=repo_type)
 
 print(f"Save file to {result}")
diff --git a/examples/download_file_parallel.py b/examples/download_file_parallel.py
@@ -0,0 +1,60 @@
+import logging
+
+from pycsghub.file_download import file_download, snapshot_download_parallel
+
+# Emit INFO-level log records with a timestamp so download activity is visible
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# token = "your access token"
+token = None
+
+endpoint = "https://hub.opencsg.com"
+repo_type = "model"
+repo_id = 'OpenCSG/csg-wukong-1B'
+local_dir = "/Users/hhwang/temp/wukong"
+
+print("=== Single-file multi-threaded download example ===")
+result = file_download(
+    repo_id,
+    file_name='README.md',
+    local_dir=local_dir,
+    endpoint=endpoint,
+    token=token,
+    repo_type=repo_type,
+    max_workers=4,
+    use_parallel=True
+)
+
+print(f"Single-file multi-threaded downloaded, save to: {result}")
+
+print("\n=== Example of multi-threaded download for the entire repository ===")
+cache_dir = "/Users/hhwang/temp/"
+allow_patterns = ["*.json", "*.md", "*.txt"]
+
+result = snapshot_download_parallel(
+    repo_id,
+    repo_type=repo_type,
+    cache_dir=cache_dir,
+    endpoint=endpoint,
+    token=token,
+    allow_patterns=allow_patterns,
+    max_workers=6,
+    use_parallel=True,
+    verbose=True
+)
+
+print(f"Repository downloaded, save to: {result}")
+
+print("\n=== Example of single-threaded download comparison ===")
+
+result_single = file_download(
+    repo_id,
+    file_name='README.md',
+    local_dir=local_dir,
+    endpoint=endpoint,
+    token=token,
+    repo_type=repo_type,
+    use_parallel=False
+)
+
+print(f"Single-threaded downloaded, save to: {result_single}")
diff --git a/examples/download_model.py b/examples/download_model.py
@@ -1,4 +1,5 @@
 from pycsghub.snapshot_download import snapshot_download
+
 # token = "your access token"
 token = None
 
@@ -10,13 +11,12 @@
 ignore_patterns = ["tokenizer.json"]
 
 result = snapshot_download(
-    repo_id, 
-    repo_type=repo_type, 
-    local_dir=local_dir, 
-    endpoint=endpoint, 
+    repo_id,
+    repo_type=repo_type,
+    local_dir=local_dir,
+    endpoint=endpoint,
     token=token,
     allow_patterns=allow_patterns,
     ignore_patterns=ignore_patterns)
 
 print(f"Save model to {result}")
-
diff --git a/examples/download_with_custom_progress.py b/examples/download_with_custom_progress.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""
+Demonstrate how to use custom progress display for model download
+"""
+
+import time
+from datetime import datetime
+from pycsghub import snapshot_download
+
+
+class CustomProgressTracker:
+    """Throttled, single-line progress bar driven by progress callbacks"""
+
+    def __init__(self):
+        # Both timestamps stay unset until the first callback arrives.
+        self.start_time = self.last_update_time = None
+
+    def progress_callback(self, progress_info):
+        """Redraw the bar at most once per second, and always on completion"""
+        now = datetime.now()
+
+        if self.start_time is None:
+            # First invocation: start the clock and the throttle window together.
+            self.start_time = self.last_update_time = now
+
+        # Throttle: a redraw is due once a full second has passed since the last one.
+        is_due = (now - self.last_update_time).total_seconds() >= 1.0
+        if is_due or progress_info['current_downloaded'] == progress_info['total_files']:
+            self._print_progress(progress_info, now)
+            self.last_update_time = now
+
+    def _print_progress(self, progress_info, current_time):
+        """Write one carriage-return-prefixed status line to stdout"""
+        total = progress_info['total_files']
+        done = progress_info['current_downloaded']
+        succeeded = progress_info['success_count']
+        failed = progress_info['failed_count']
+        remaining = progress_info['remaining_count']
+
+        # A total of zero is reported as 0% rather than dividing by zero.
+        percent = (done / total) * 100 if total > 0 else 0
+
+        elapsed = (current_time - self.start_time).total_seconds()
+
+        # ETA: elapsed time per counted download, scaled by remaining_count.
+        eta = (elapsed / done) * remaining if done > 0 else 0
+
+        width = 30
+        filled = int(width * percent / 100)
+        bar = '█' * filled + '░' * (width - filled)
+
+        status = (
+            f"\r[{bar}] {percent:5.1f}% | "
+            f"Downloaded: {done}/{total} | "
+            f"Success: {succeeded} | "
+            f"Failed: {failed} | "
+            f"Remaining: {remaining} | "
+            f"Elapsed: {elapsed:.1f}s | "
+            f"Estimated remaining: {eta:.1f}s"
+        )
+        # end='' keeps the cursor on the line so the next update overwrites it.
+        print(status, end='', flush=True)
+
+        # Terminate the line once the downloaded count reaches the total.
+        if done == total:
+            print()
+
+
+def main():
+    """Call snapshot_download with a CustomProgressTracker callback and report the outcome"""
+    print("Start demonstrating custom progress tracking for model download...")
+
+    tracker = CustomProgressTracker()
+
+    # Keyword arguments forwarded to snapshot_download unchanged.
+    download_options = {
+        "repo_id": "OpenCSG/csg-wukong-1B",
+        "progress_callback": tracker.progress_callback,
+        "verbose": False,
+        "use_parallel": True,
+        "max_workers": 4,
+    }
+
+    try:
+        saved_to = snapshot_download(**download_options)
+
+        # Reported inside the try so a failure here is handled the same way.
+        print(f"\n✅ Download completed! Model saved to: {saved_to}")
+
+    except Exception as error:
+        print(f"\n❌ Error during download: {error}")
+
+
+if __name__ == "__main__":  # run the demo only when executed as a script
+    main()
diff --git a/examples/download_with_progress.py b/examples/download_with_progress.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+"""
+Demonstrate how to use the progress callback feature of snapshot_download
+"""
+
+import time
+from pycsghub import snapshot_download
+
+
+def progress_callback(progress_info):
+    """Print a multi-line report of the counters and file lists in progress_info"""
+    print("\n=== Download progress update ===")
+    counters = (
+        ("Total files", 'total_files'),
+        ("Current downloaded", 'current_downloaded'),
+        ("Success count", 'success_count'),
+        ("Failed count", 'failed_count'),
+        ("Remaining count", 'remaining_count'),
+    )
+    for label, key in counters:
+        print(f"{label}: {progress_info[key]}")
+
+    succeeded = progress_info['successful_files']
+    if succeeded:
+        print(f"Recently successful downloaded file: {succeeded[-1]}")
+    pending = progress_info['remaining_files']
+    if pending:
+        print(f"Next file to download: {pending[0]}")
+    total = progress_info['total_files']
+    if total > 0:  # the percentage line is skipped when the total is zero
+        print(f"Overall progress: {(progress_info['current_downloaded'] / total) * 100:.1f}%")
+    print("=" * 30)
+
+
+def main():
+    """Call snapshot_download on a placeholder repository with progress_callback attached"""
+    print("Start demonstrating download with progress callback...")
+
+    # Placeholder repository ID; replace it with a real model ID before running.
+    target_repo = "example/model"
+
+    # Keyword arguments forwarded to snapshot_download unchanged.
+    download_options = dict(
+        repo_id=target_repo,
+        progress_callback=progress_callback,
+        verbose=True,
+        use_parallel=True,
+        max_workers=4,
+    )
+
+    try:
+        saved_to = snapshot_download(**download_options)
+
+        print(f"\nDownload completed! Model saved to: {saved_to}")
+
+    except Exception as error:
+        print(f"Error during download: {error}")
+
+
+if __name__ == "__main__":  # run the demo only when executed as a script
+    main()
diff --git a/examples/run_finetune_bert.py b/examples/run_finetune_bert.py
@@ -1,12 +1,12 @@
 from typing import Any
-import pandas as pd
 
+import pandas as pd
 from transformers import DataCollatorWithPadding
-from transformers import TrainingArguments
 from transformers import Trainer
+from transformers import TrainingArguments
 
+from pycsghub.repo_reader import AutoModelForSequenceClassification, AutoTokenizer
 from pycsghub.repo_reader import load_dataset
-from pycsghub.repo_reader import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 
 model_id_or_path = "wanghh2000/bert-base-uncased"
 tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=True)
@@ -18,29 +18,34 @@
 access_token = None
 raw_datasets = load_dataset(dsPath, dsName, token=access_token)
 
+
 def get_data_proprocess() -> Any:
-    def preprocess_function(examples: pd.DataFrame):            
+    def preprocess_function(examples: pd.DataFrame):
         ret = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=100)
         ret = {**examples, **ret}
         return pd.DataFrame.from_dict(ret)
+
     return preprocess_function
 
+
 train_dataset = raw_datasets["train"].select(range(20)).map(get_data_proprocess(), batched=True)
 eval_dataset = raw_datasets["validation"].select(range(20)).map(get_data_proprocess(), batched=True)
 
+
 def data_collator() -> Any:
     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
     return data_collator
 
+
 outputDir = "/Users/hhwang/temp/ff"
 args = TrainingArguments(
     outputDir,
     evaluation_strategy="steps",
     save_strategy="steps",
     logging_strategy="steps",
-    logging_steps = 2,
-    save_steps = 10,
-    eval_steps = 2,
+    logging_steps=2,
+    save_steps=10,
+    eval_steps=2,
     learning_rate=2e-5,
     per_device_train_batch_size=4,
     per_device_eval_batch_size=4,

diff --git a/examples/run_wukong_inference.py b/examples/run_wukong_inference.py
@@ -1,4 +1,3 @@
-import os 
 from pycsghub.repo_reader import AutoModelForCausalLM, AutoTokenizer
 
 mid = 'OpenCSG/csg-wukong-1B'
@@ -7,4 +6,4 @@
 
 inputs = tokenizer.encode("Write a short story", return_tensors="pt")
 outputs = model.generate(inputs)
-print('result: ',tokenizer.batch_decode(outputs))
+print('result: ', tokenizer.batch_decode(outputs))
diff --git a/examples/upload_repo.py b/examples/upload_repo.py
@@ -11,4 +11,4 @@
     repo_type="dataset",
 )
 
-r.upload()
+r.upload()