Skip to content

Commit 8f718e7

Browse files
committed
handle large files, gc and check file before versioning
1 parent e1b24bf commit 8f718e7

File tree

2 files changed

+124
-52
lines changed

2 files changed

+124
-52
lines changed

api/models/Resource.py

Lines changed: 123 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
1+
import os
2+
import random
13
import uuid
24
from typing import TYPE_CHECKING, Any, Optional
35

6+
import structlog
47
from django.conf import settings
58
from django.contrib.auth import get_user_model
69
from django.db import models
710
from django.db.models.signals import post_save
811
from django.dispatch import receiver
912
from django.utils.text import slugify
1013

14+
logger = structlog.getLogger(__name__)
15+
1116
from api.managers.dvc_manager import DVCManager
1217
from api.utils.enums import DataType
1318

@@ -113,64 +118,131 @@ class Meta:
113118
db_table = "resource_version"
114119

115120

116-
@receiver(post_save, sender=Resource)
117-
def version_resource_with_dvc(sender, instance: Resource, created, **kwargs):
121+
@receiver(post_save, sender=ResourceFileDetails)
122+
def version_resource_with_dvc(sender, instance: ResourceFileDetails, created, **kwargs):
118123
"""Create a new version using DVC when resource is updated"""
124+
# Initialize DVC manager
125+
dvc = DVCManager(settings.DVC_REPO_PATH)
126+
119127
# Skip if this is just being created (first version)
120128
if created:
121129
# For first version, we just track it
122-
dvc = DVCManager(settings.DVC_REPO_PATH)
123-
dvc_file = dvc.track_resource(instance.resourcefiledetails.file.path)
124-
message = f"Add resource: {instance.name} version {instance.version}"
125-
dvc.commit_version(dvc_file, message)
126-
dvc.tag_version(f"{instance.name}-{instance.version}")
127-
128-
# Create first version record
129-
ResourceVersion.objects.create(
130-
resource=instance,
131-
version_number=instance.version,
132-
change_description=f"Initial version of {instance.name}",
133-
)
130+
try:
131+
# Use chunked mode for large files (over 100MB)
132+
file_size = (
133+
instance.file.size
134+
if hasattr(instance.file, "size")
135+
else os.path.getsize(instance.file.path)
136+
)
137+
use_chunked = file_size > 100 * 1024 * 1024 # 100MB threshold
138+
139+
dvc_file = dvc.track_resource(instance.file.path, chunked=use_chunked)
140+
message = f"Add resource: {instance.resource.name} version {instance.resource.version}"
141+
dvc.commit_version(dvc_file, message)
142+
dvc.tag_version(f"{instance.resource.name}-{instance.resource.version}")
143+
144+
# Create first version record
145+
ResourceVersion.objects.create(
146+
resource=instance.resource,
147+
version_number=instance.resource.version,
148+
change_description=f"Initial version of {instance.resource.name}",
149+
)
150+
except Exception as e:
151+
logger.error(f"Failed to version resource: {str(e)}")
152+
# Continue without versioning if it fails
153+
# This allows the resource to be created even if DVC fails
154+
pass
134155
else:
135-
# For updates, create a new version
136-
# Determine version number (could implement semantic versioning logic)
137-
last_version: Optional[ResourceVersion] = instance.versions.order_by(
138-
"-created_at"
139-
).first()
140-
141-
# Handle case when there are no versions yet
142-
if last_version is None:
143-
new_version = "v1.0"
144-
else:
145-
new_version = _increment_version(last_version.version_number)
146-
147-
# Update using DVC
148-
dvc = DVCManager(settings.DVC_REPO_PATH)
149-
dvc_file = dvc.track_resource(instance.resourcefiledetails.file.path)
150-
message = f"Update resource: {instance.name} to version {new_version}"
151-
dvc.commit_version(dvc_file, message)
152-
dvc.tag_version(f"{instance.name}-{new_version}")
153-
154-
# Create version record
155-
ResourceVersion.objects.create(
156-
resource=instance,
157-
version_number=new_version,
158-
change_description=f"Updated version of {instance.name}",
159-
)
160-
161-
# Update dataset version field
162-
instance.version = new_version
163-
instance.save(update_fields=["version"])
164-
165-
166-
def _increment_version(version: str) -> str:
167-
"""Simple version incrementing logic"""
156+
# For updates, check if the file has actually changed before creating a new version
157+
try:
158+
# Skip versioning if the file hasn't changed
159+
if dvc.verify_file(instance.file.path):
160+
logger.info(
161+
f"No changes detected for {instance.resource.name}, skipping version creation"
162+
)
163+
return
164+
165+
# Determine version number using semantic versioning
166+
last_version: Optional[ResourceVersion] = (
167+
instance.resource.versions.order_by("-created_at").first()
168+
)
169+
170+
# Handle case when there are no versions yet
171+
if last_version is None:
172+
new_version = "v1.0.0"
173+
else:
174+
# Default to minor version increment, could be configurable in the future
175+
new_version = _increment_version(
176+
last_version.version_number, increment_type="minor"
177+
)
178+
179+
# Use chunked mode for large files (over 100MB)
180+
file_size = (
181+
instance.file.size
182+
if hasattr(instance.file, "size")
183+
else os.path.getsize(instance.file.path)
184+
)
185+
use_chunked = file_size > 100 * 1024 * 1024 # 100MB threshold
186+
187+
# Update using DVC
188+
dvc_file = dvc.track_resource(instance.file.path, chunked=use_chunked)
189+
message = (
190+
f"Update resource: {instance.resource.name} to version {new_version}"
191+
)
192+
dvc.commit_version(dvc_file, message)
193+
dvc.tag_version(f"{instance.resource.name}-{new_version}")
194+
195+
# Create version record
196+
ResourceVersion.objects.create(
197+
resource=instance.resource,
198+
version_number=new_version,
199+
change_description=f"Updated version of {instance.resource.name}",
200+
)
201+
202+
# Update resource version field
203+
instance.resource.version = new_version
204+
instance.resource.save(update_fields=["version"])
205+
206+
# Optional: Trigger garbage collection periodically
207+
# This could be moved to a scheduled task instead
208+
if random.random() < 0.05: # 5% chance to run GC on version update
209+
try:
210+
dvc.gc_cache()
211+
except Exception as e:
212+
logger.warning(f"Failed to run garbage collection: {str(e)}")
213+
except Exception as e:
214+
logger.error(f"Failed to update resource version: {str(e)}")
215+
# Continue without versioning if it fails
216+
pass
217+
218+
219+
def _increment_version(version: str, increment_type: str = "minor") -> str:
220+
"""Semantic version incrementing logic
221+
222+
Args:
223+
version: Current version string (e.g., "v1.0.0")
224+
increment_type: One of "major", "minor", or "patch"
225+
226+
Returns:
227+
New version string
228+
"""
168229
if version.startswith("v"):
169230
version = version[1:]
170231

171232
parts = version.split(".")
172-
if len(parts) == 2:
173-
major, minor = map(int, parts)
233+
if len(parts) < 3:
234+
parts = parts + ["0"] * (3 - len(parts))
235+
236+
major, minor, patch = map(int, parts)
237+
238+
if increment_type == "major":
239+
major += 1
240+
minor = 0
241+
patch = 0
242+
elif increment_type == "minor":
174243
minor += 1
175-
return f"v{major}.{minor}"
176-
return f"v1.0"
244+
patch = 0
245+
else: # patch
246+
patch += 1
247+
248+
return f"v{major}.{minor}.{patch}"

api/types/type_usecase.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import TYPE_CHECKING, List, Optional
1+
from typing import List, Optional
22

33
import strawberry
44
import strawberry_django

0 commit comments

Comments
 (0)