Skip to content

Commit e1b24bf

Browse files
committed
add management command to setup dvc
1 parent a1faa03 commit e1b24bf

File tree

2 files changed

+129
-0
lines changed

2 files changed

+129
-0
lines changed

DataSpace/settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,8 @@
246246

247247
# DVC settings
248248
DVC_REPO_PATH = os.path.join(BASE_DIR, "dvc")
249+
DVC_REMOTE_NAME = os.getenv("DVC_REMOTE_NAME", None)
250+
DVC_REMOTE_URL = os.getenv("DVC_REMOTE_URL", None)
249251

250252
# Django REST Framework settings
251253
REST_FRAMEWORK = {
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import os
2+
import subprocess
3+
from typing import Any, Optional, cast
4+
5+
import structlog
6+
from django.conf import settings
7+
from django.core.management.base import BaseCommand
8+
9+
logger = structlog.getLogger(__name__)
10+
11+
12+
class Command(BaseCommand):
    """Prepare the Git + DVC repository used for dataset versioning.

    The command works inside ``settings.DVC_REPO_PATH`` and is safe to run
    repeatedly: each step either checks for existing state first or uses an
    operation that overwrites the previous value.

    Steps, in order:
      1. create the repository directory if it does not exist;
      2. run ``git init`` / ``dvc init`` when ``.git`` / ``.dvc`` are missing;
      3. write the DVC cache settings;
      4. register and select the default DVC remote when both
         ``settings.DVC_REMOTE_NAME`` and ``settings.DVC_REMOTE_URL`` are set;
      5. give the repository a Git identity when ``user.name`` is not set.

    Any failure is logged and reported on the command's stdout; it is not
    re-raised, so the command itself still returns normally.
    """

    help = "Set up DVC repository for dataset versioning"

    # Identity written to the Git config when no user.name is configured.
    # NOTE(review): the e-mail literal looks redacted/obfuscated in this view
    # -- confirm the real address before relying on it.
    _GIT_USER_NAME = "DataEx Bot"
    _GIT_USER_EMAIL = "[email protected]"

    def handle(self, *args: Any, **options: Any) -> None:
        """Run every setup step; report (but do not raise) the first failure."""
        try:
            repo_path = settings.DVC_REPO_PATH

            self._ensure_repository(repo_path)
            self._configure_cache(repo_path)
            self._configure_remote(repo_path)
            self._ensure_git_user(repo_path)

            self.stdout.write(self.style.SUCCESS("DVC repository set up successfully"))

        except Exception as e:
            # Top-level boundary of the command: surface the problem to both
            # the structured log and the operator running the command.
            logger.error(f"Failed to set up DVC repository: {str(e)}")
            self.stdout.write(
                self.style.ERROR(f"Failed to set up DVC repository: {str(e)}")
            )

    @staticmethod
    def _run(repo_path: str, *command: str) -> None:
        """Run ``command`` inside ``repo_path``.

        Raises:
            subprocess.CalledProcessError: if the command exits non-zero.
        """
        subprocess.run(list(command), cwd=repo_path, check=True)

    def _ensure_repository(self, repo_path: str) -> None:
        """Create the directory and initialise Git and DVC where missing."""
        if not os.path.exists(repo_path):
            # exist_ok guards against another process creating the directory
            # between the check above and this call.
            os.makedirs(repo_path, exist_ok=True)
            self.stdout.write(f"Created DVC repository directory at {repo_path}")

        # Git must be initialised first: `dvc init` is run inside the Git repo.
        if not os.path.exists(os.path.join(repo_path, ".git")):
            self._run(repo_path, "git", "init")
            self.stdout.write("Initialized Git repository")

        if not os.path.exists(os.path.join(repo_path, ".dvc")):
            self._run(repo_path, "dvc", "init")
            self.stdout.write("Initialized DVC repository")

    def _configure_cache(self, repo_path: str) -> None:
        """Write the DVC cache options used for large dataset files."""
        # Link files out of the cache (hardlink first, symlink as fallback).
        self._run(repo_path, "dvc", "config", "cache.type", "hardlink,symlink")
        self._run(repo_path, "dvc", "config", "cache.shared", "group")

        # Intended to cap cache disk usage.
        # NOTE(review): confirm that `cache.size_limit` is an option the
        # installed DVC version accepts; if it is rejected, this call fails
        # and the whole setup is reported as failed.
        self._run(repo_path, "dvc", "config", "cache.size_limit", "10G")

        self.stdout.write("Configured DVC for large file handling")

    def _configure_remote(self, repo_path: str) -> None:
        """Register the configured remote and make it the default.

        Does nothing unless both ``DVC_REMOTE_NAME`` and ``DVC_REMOTE_URL``
        are present in settings and non-empty.
        """
        remote_name: Optional[str] = getattr(settings, "DVC_REMOTE_NAME", None)
        remote_url: Optional[str] = getattr(settings, "DVC_REMOTE_URL", None)
        if not (remote_name and remote_url):
            return

        # --force overwrites an existing remote of the same name, so running
        # the command again (or after the URL changed) does not fail.
        self._run(repo_path, "dvc", "remote", "add", "--force", remote_name, remote_url)

        # Set as default remote
        self._run(repo_path, "dvc", "remote", "default", remote_name)

        self.stdout.write(f"Configured DVC remote: {remote_name}")

    def _ensure_git_user(self, repo_path: str) -> None:
        """Set a Git identity for the repository when ``user.name`` is unset."""
        # No check=True here: the lookup is only used for its output, and an
        # empty result is treated as "not configured".
        lookup = subprocess.run(
            ["git", "config", "user.name"],
            cwd=repo_path,
            capture_output=True,
            text=True,
        )
        if lookup.stdout.strip():
            return

        self._run(repo_path, "git", "config", "user.name", self._GIT_USER_NAME)
        self._run(repo_path, "git", "config", "user.email", self._GIT_USER_EMAIL)
        self.stdout.write("Configured Git user for DVC repository")

0 commit comments

Comments
 (0)