72 changes: 49 additions & 23 deletions lib/rift/Annex.py
@@ -90,7 +90,7 @@ def is_binary(filepath, blocksize=65536):

def hashfile(filepath, iosize=65536):
"""Compute a digest of filepath content."""
hasher = hashlib.md5()
hasher = hashlib.sha3_256()
with open(filepath, 'rb') as srcfile:
buf = srcfile.read(iosize)
while len(buf) > 0:
@@ -123,10 +123,20 @@ def is_pointer(cls, filepath):
identifier.
"""
meta = os.stat(filepath)

# MD5
if meta.st_size == 32:
Comment on lines +127 to 128
question: is there no other way to know the type of hash being used? Feels really fragile.

logging.warning("Using deprecated hash algorithm (MD5)")
minor: I don't think adding this warning is important, since it will be normal to have it until all file paths are migrated to SHA3, at which point this shouldn't occur anymore.

with open(filepath, encoding='utf-8') as fh:
identifier = fh.read(32)
return all(byte in string.hexdigits for byte in identifier)

# SHA3 256
elif meta.st_size == 64:
with open(filepath, encoding='utf-8') as fh:
identifier = fh.read(64)
return all(byte in string.hexdigits for byte in identifier)

Comment on lines 128 to +139
minor: merge the two conditions
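
A minimal sketch of what the merged branch might look like, assuming the pointer size (32 hex characters for MD5, 64 for SHA3-256) stays the only discriminator; the standalone helper form is illustrative, not the method as committed, and the MD5 deprecation warning is kept only to preserve the current behaviour:

    import logging
    import os
    import string

    def is_pointer(filepath):
        """Return True if filepath looks like an annex pointer (a bare hex digest)."""
        size = os.stat(filepath).st_size
        # One branch covers both digest lengths instead of two near-identical ones.
        if size not in (32, 64):
            return False
        if size == 32:
            logging.warning("Using deprecated hash algorithm (MD5)")
        with open(filepath, encoding='utf-8') as fh:
            identifier = fh.read(size)
        return all(byte in string.hexdigits for byte in identifier)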

return False

def get(self, identifier, destpath):
@@ -226,13 +236,29 @@ def list(self):
if not filename.endswith('.info'):
suggest: change the condition to check whether it ends with .info and continue if it does
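
A sketch of the suggested inversion, assuming the enclosing loop (not visible in this hunk) walks the annex directory with something like os.listdir:

    for filename in os.listdir(self.path):
        # Skip the metadata companions up front instead of nesting
        # the whole body under the negated check.
        if filename.endswith('.info'):
            continue
        info = self._load_metadata(filename)
        # ... rest of the loop body unchanged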

info = self._load_metadata(filename)
names = info.get('filenames', [])
for annexed_file in names.values():
insertion_time = annexed_file['date']
insertion_time = datetime.datetime.strptime(insertion_time, "%c").timestamp()

#The file size must come from the filesystem
meta = os.stat(os.path.join(self.path, filename))
yield filename, meta.st_size, insertion_time, names
for annexed_file, details in names.items():
insertion_time = details['date']

# Handle different date formats (old method)
if isinstance(insertion_time, str):
for fmt in ('%a %b %d %H:%M:%S %Y', '%a %d %b %Y %H:%M:%S %p %Z'):
try:
insertion_time = datetime.datetime.strptime(insertion_time, fmt).timestamp()
break
except ValueError:
continue
else:
raise ValueError(f"Invalid date format in metadata: {insertion_time}")

# UNIX timestamp
elif isinstance(insertion_time, (int, float)):
question: is the float type really possible? Seems unlikely for a timestamp.

insertion_time = insertion_time
major: don't think this line is really necessary

else:
raise ValueError("Invalid date format in metadata")
suggest: print the type of the object
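
A one-line sketch of the suggestion, extending the existing message in place; the exact wording is illustrative:

    raise ValueError(f"Invalid date format in metadata: {type(insertion_time).__name__}")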


# The file size must come from the filesystem
meta = os.stat(os.path.join(self.path, filename))
yield filename, meta.st_size, insertion_time, [annexed_file]

def push(self, filepath):
"""
@@ -254,21 +280,21 @@ def push(self, filepath):
destinfo = None
if os.path.exists(destpath):
destinfo = os.stat(destpath)
if destinfo and destinfo.st_size == originfo.st_size and \
filename in metadata.get('filenames', {}):
logging.debug('%s is already into annex, skipping it', filename)

else:
# Update them and write them back
fileset = metadata.setdefault('filenames', {})
fileset.setdefault(filename, {})
fileset[filename]['date'] = time.strftime("%c")
self._save_metadata(digest, metadata)

# Move binary file to annex
logging.debug('Importing %s into annex (%s)', filepath, digest)
shutil.copyfile(filepath, destpath)
os.chmod(destpath, self.WMODE)
if destinfo and destinfo.st_size == originfo.st_size and \
filename in metadata.get('filenames', {}):
logging.debug('%s is already into annex, skipping it', filename)
return
Comment on lines 281 to +286
question: shouldn't we return an error, or at least continue, if os.path.exists(destpath) is false?


# Update them and write them back
fileset = metadata.setdefault('filenames', {})
fileset.setdefault(filename, {})
fileset[filename]['date'] = time.time() # Unix timestamp
self._save_metadata(digest, metadata)

# Move binary file to annex
logging.debug('Importing %s into annex (%s)', filepath, digest)
shutil.copyfile(filepath, destpath)
os.chmod(destpath, self.WMODE)

# Verify permission are correct before copying
os.chmod(filepath, self.RMODE)
13 changes: 4 additions & 9 deletions tests/Annex.py
@@ -192,22 +192,17 @@ def test_delete(self):
self.annex.get_by_path(source_file.name, '/dev/null')

def test_list(self):
""" Test list method """
"""Test the list method"""

source_size = os.stat(self.source.name).st_size
source_insertion_time = datetime.datetime.strptime(time.strftime('%c'), '%c').timestamp()
# Get the current time with the %c format and convert it to unix timestamp to
# have the same method as annex.list (in terms of precision)
source_insertion_time = time.time()
self.annex.push(self.source.name)

# Check if the file pointer is present in the annex list output
# by checking it's attributes
for filename, size, insertion_time, names in self.annex.list():
self.assertEqual(get_digest_from_path(self.source.name), filename)
self.assertEqual(source_size, size)
# As tests can take time to run, accept less or equal 1 second shift
self.assertAlmostEqual(source_insertion_time, insertion_time, delta=1)
self.assertTrue(os.path.basename(self.source.name) in names.keys())
self.assertAlmostEqual(source_insertion_time, insertion_time, delta=1)  # delta for potential delays
self.assertTrue(os.path.basename(self.source.name) in names)

def test_push(self):
""" Test push method """