Skip to content
Open
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ dist/
.pypirc
.vscode/
.DS_Store
.codesouler/
42 changes: 42 additions & 0 deletions build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
"""
Build script for handling version information
"""

import re
from pathlib import Path


def get_version():
    """Return the package version declared in ``pyproject.toml``.

    The file is looked up relative to the current working directory. Only a
    line whose key is exactly ``version`` is considered, so keys that merely
    end in "version" (``target-version``, ``python_version``, ...) are never
    mistaken for the package version. Whitespace around ``=`` is optional.

    Returns:
        The first matching version string, or ``"0.1.0"`` when the file does
        not exist or contains no ``version`` entry.
    """
    default_version = "0.1.0"
    pyproject_path = Path("pyproject.toml")

    try:
        content = pyproject_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return default_version

    # Anchor at the start of a line (MULTILINE) so the key must be exactly
    # "version"; an unanchored search would also hit e.g. 'target-version'.
    match = re.search(
        r'^[ \t]*version[ \t]*=[ \t]*"([^"]+)"',
        content,
        re.MULTILINE,
    )
    return match.group(1) if match else default_version


def update_version_in_init():
    """Write the version from ``get_version()`` into ``pycsghub/__init__.py``.

    Every ``__version__ = "..."`` assignment in the file is rewritten to carry
    the current version. The path is relative to the current working
    directory. The file is only rewritten when at least one assignment was
    found; otherwise a message explains why nothing was updated.
    """
    init_path = Path("pycsghub/__init__.py")
    if not init_path.exists():
        print(f"{init_path} not found, version not updated")
        return

    # Resolve the version once so the value written and the value reported
    # are guaranteed to be the same (and pyproject.toml is read only once).
    version = get_version()

    with open(init_path, "r", encoding="utf-8") as f:
        content = f.read()

    # A callable replacement inserts the version literally; a plain string
    # would be treated as a template in which backslashes are interpreted.
    new_content, replaced = re.subn(
        r'__version__ = "[^"]*"',
        lambda _match: f'__version__ = "{version}"',
        content,
    )
    if not replaced:
        print(f'No __version__ = "..." assignment found in {init_path}, version not updated')
        return

    with open(init_path, "w", encoding="utf-8") as f:
        f.write(new_content)
    print(f"Updated version to {version} in __init__.py")


if __name__ == "__main__":
update_version_in_init()
1 change: 1 addition & 0 deletions examples/download_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pycsghub.snapshot_download import snapshot_download

# token = "your access token"
token = None

Expand Down
11 changes: 6 additions & 5 deletions examples/download_file.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pycsghub.file_download import file_download

# token = "your access token"
token = None

Expand All @@ -7,11 +8,11 @@
repo_id = 'OpenCSG/csg-wukong-1B'
local_dir = "/Users/hhwang/temp/wukong"
result = file_download(
repo_id,
file_name='README.md',
local_dir=local_dir,
endpoint=endpoint,
token=token,
repo_id,
file_name='README.md',
local_dir=local_dir,
endpoint=endpoint,
token=token,
repo_type=repo_type)

print(f"Save file to {result}")
60 changes: 60 additions & 0 deletions examples/download_file_parallel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import logging

from pycsghub.file_download import file_download, snapshot_download_parallel

# Configure logging level: INFO and above, with timestamp and level in each record
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Access token passed to every download call below; None means no token is sent.
# token = "your access token"
token = None

# Settings shared by all three examples.
# NOTE(review): local_dir (and cache_dir further down) are developer-specific
# absolute paths - adjust them before running this example.
endpoint = "https://hub.opencsg.com"
repo_type = "model"
repo_id = 'OpenCSG/csg-wukong-1B'
local_dir = "/Users/hhwang/temp/wukong"

# Example 1: fetch a single file with use_parallel=True and max_workers=4.
print("=== Single-file multi-threaded download example ===")
result = file_download(
repo_id,
file_name='README.md',
local_dir=local_dir,
endpoint=endpoint,
token=token,
repo_type=repo_type,
max_workers=4,
use_parallel=True
)

print(f"Single-file multi-threaded downloaded ,save to: {result}")

# Example 2: fetch the repository through snapshot_download_parallel with
# max_workers=6, passing allow_patterns for JSON, Markdown and text files.
print("\n=== Example of multi-threaded download for the entire repository ===")
cache_dir = "/Users/hhwang/temp/"
allow_patterns = ["*.json", "*.md", "*.txt"]

result = snapshot_download_parallel(
repo_id,
repo_type=repo_type,
cache_dir=cache_dir,
endpoint=endpoint,
token=token,
allow_patterns=allow_patterns,
max_workers=6,
use_parallel=True,
verbose=True
)

print(f"Repository downloaded, save to: {result}")

# Example 3: the same single-file request as example 1, but with
# use_parallel=False and no max_workers, for comparison.
print("\n=== Example of single-threaded download comparison ===")

result_single = file_download(
repo_id,
file_name='README.md',
local_dir=local_dir,
endpoint=endpoint,
token=token,
repo_type=repo_type,
use_parallel=False
)

print(f"Single-threaded downloaded, save to: {result_single}")
10 changes: 5 additions & 5 deletions examples/download_model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pycsghub.snapshot_download import snapshot_download

# token = "your access token"
token = None

Expand All @@ -10,13 +11,12 @@
ignore_patterns = ["tokenizer.json"]

result = snapshot_download(
repo_id,
repo_type=repo_type,
local_dir=local_dir,
endpoint=endpoint,
repo_id,
repo_type=repo_type,
local_dir=local_dir,
endpoint=endpoint,
token=token,
allow_patterns=allow_patterns,
ignore_patterns=ignore_patterns)

print(f"Save model to {result}")

95 changes: 95 additions & 0 deletions examples/download_with_custom_progress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
Demonstrate how to use custom progress display for model download
"""

import time
from datetime import datetime
from pycsghub import snapshot_download


class CustomProgressTracker:
    """Console progress bar that can be handed to a download as its callback."""

    def __init__(self):
        # Both timestamps stay unset until the first callback arrives.
        self.start_time = None
        self.last_update_time = None

    def progress_callback(self, progress_info):
        """Handle one progress notification.

        The first call records the start time. The bar is redrawn when at
        least one second has passed since the previous redraw, or when
        ``current_downloaded`` equals ``total_files``.
        """
        now = datetime.now()

        if self.start_time is None:
            self.start_time = self.last_update_time = now

        since_redraw = (now - self.last_update_time).total_seconds()
        if since_redraw < 1.0 and progress_info['current_downloaded'] != progress_info['total_files']:
            # Throttled: too soon after the last redraw and not yet finished.
            return

        self._print_progress(progress_info, now)
        self.last_update_time = now

    def _print_progress(self, progress_info, current_time):
        """Draw the bar and counters on the current console line."""
        total = progress_info['total_files']
        done = progress_info['current_downloaded']
        succeeded = progress_info['success_count']
        failed = progress_info['failed_count']
        remaining = progress_info['remaining_count']

        percent = (done / total) * 100 if total > 0 else 0
        elapsed = (current_time - self.start_time).total_seconds()
        # Estimate from the average time per processed file so far.
        eta = (elapsed / done) * remaining if done > 0 else 0

        width = 30
        filled = int(width * percent / 100)
        bar = '█' * filled + '░' * (width - filled)

        # The leading carriage return makes each redraw overwrite the last.
        line = (
            f"\r[{bar}] {percent:5.1f}% | "
            f"Downloaded: {done}/{total} | "
            f"Success: {succeeded} | "
            f"Failed: {failed} | "
            f"Remaining: {remaining} | "
            f"Elapsed: {elapsed:.1f}s | "
            f"Estimated remaining: {eta:.1f}s"
        )
        print(line, end='', flush=True)

        if done == total:
            # Finished: terminate the in-place line.
            print()


def main():
    """Run the demo: download ``OpenCSG/csg-wukong-1B`` with a custom progress bar.

    Any exception raised during the download is reported on stdout rather
    than propagated.
    """
    print("Start demonstrating custom progress tracking for model download...")

    tracker = CustomProgressTracker()
    download_options = dict(
        repo_id="OpenCSG/csg-wukong-1B",
        progress_callback=tracker.progress_callback,
        verbose=False,
        use_parallel=True,
        max_workers=4,
    )

    try:
        saved_to = snapshot_download(**download_options)
        print(f"\n✅ Download completed! Model saved to: {saved_to}")
    except Exception as e:
        print(f"\n❌ Error during download: {e}")


if __name__ == "__main__":
main()
61 changes: 61 additions & 0 deletions examples/download_with_progress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
"""
Demonstrate how to use the progress callback feature of snapshot_download
"""

import time
from pycsghub import snapshot_download


def progress_callback(progress_info):
    """Print a multi-line report for one progress notification.

    Reads the counters ``total_files``, ``current_downloaded``,
    ``success_count``, ``failed_count`` and ``remaining_count`` and the lists
    ``successful_files`` and ``remaining_files`` from ``progress_info``.
    """
    print("\n=== Download progress update ===")
    print(f"Total files: {progress_info['total_files']}")
    print(f"Current downloaded: {progress_info['current_downloaded']}")
    print(f"Success count: {progress_info['success_count']}")
    print(f"Failed count: {progress_info['failed_count']}")
    print(f"Remaining count: {progress_info['remaining_count']}")

    finished = progress_info['successful_files']
    if finished:
        # The last entry is reported as the most recent success.
        print(f"Recently successful downloaded file: {finished[-1]}")

    pending = progress_info['remaining_files']
    if pending:
        print(f"Next file to download: {pending[0]}")

    total = progress_info['total_files']
    if total > 0:
        progress_percent = (progress_info['current_downloaded'] / total) * 100
        print(f"Overall progress: {progress_percent:.1f}%")

    print("=" * 30)


def main():
    """Run the demo: download a repository while reporting progress via callback.

    Any exception raised during the download is reported on stdout rather
    than propagated.
    """
    print("Start demonstrating download with progress callback...")

    # Placeholder model ID - replace with a real one before running
    download_options = dict(
        repo_id="example/model",
        progress_callback=progress_callback,
        verbose=True,
        use_parallel=True,
        max_workers=4,
    )

    try:
        # Download the model, reporting through the progress callback
        saved_to = snapshot_download(**download_options)
        print(f"\nDownload completed! Model saved to: {saved_to}")
    except Exception as e:
        print(f"Error during download: {e}")


if __name__ == "__main__":
main()
19 changes: 12 additions & 7 deletions examples/run_finetune_bert.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from typing import Any
import pandas as pd

import pandas as pd
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
from transformers import TrainingArguments

from pycsghub.repo_reader import AutoModelForSequenceClassification, AutoTokenizer
from pycsghub.repo_reader import load_dataset
from pycsghub.repo_reader import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

model_id_or_path = "wanghh2000/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=True)
Expand All @@ -18,29 +18,34 @@
access_token = None
raw_datasets = load_dataset(dsPath, dsName, token=access_token)


def get_data_proprocess() -> Any:
def preprocess_function(examples: pd.DataFrame):
def preprocess_function(examples: pd.DataFrame):
ret = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=100)
ret = {**examples, **ret}
return pd.DataFrame.from_dict(ret)

return preprocess_function


train_dataset = raw_datasets["train"].select(range(20)).map(get_data_proprocess(), batched=True)
eval_dataset = raw_datasets["validation"].select(range(20)).map(get_data_proprocess(), batched=True)


def data_collator() -> Any:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
return data_collator


outputDir = "/Users/hhwang/temp/ff"
args = TrainingArguments(
outputDir,
evaluation_strategy="steps",
save_strategy="steps",
logging_strategy="steps",
logging_steps = 2,
save_steps = 10,
eval_steps = 2,
logging_steps=2,
save_steps=10,
eval_steps=2,
learning_rate=2e-5,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
Expand Down
3 changes: 1 addition & 2 deletions examples/run_wukong_inference.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
from pycsghub.repo_reader import AutoModelForCausalLM, AutoTokenizer

mid = 'OpenCSG/csg-wukong-1B'
Expand All @@ -7,4 +6,4 @@

inputs = tokenizer.encode("Write a short story", return_tensors="pt")
outputs = model.generate(inputs)
print('result: ',tokenizer.batch_decode(outputs))
print('result: ', tokenizer.batch_decode(outputs))
2 changes: 1 addition & 1 deletion examples/upload_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@
repo_type="dataset",
)

r.upload()
r.upload()
Loading