1313"""Utilities to support workflow."""
1414from __future__ import absolute_import
1515
16+ from pathlib import Path
1617from typing import List , Sequence , Union
1718import hashlib
1819from urllib .parse import unquote , urlparse
20+ from _hashlib import HASH as Hash
1921
2022from sagemaker .workflow .entities import (
2123 Entity ,
2224 RequestType ,
2325)
2426from sagemaker .workflow .step_collections import StepCollection
2527
28+ BUF_SIZE = 65536 # 64KiB
29+
2630
2731def list_to_request (entities : Sequence [Union [Entity , StepCollection ]]) -> List [RequestType ]:
2832 """Get the request structure for list of entities.
@@ -49,15 +53,82 @@ def hash_file(path: str) -> str:
4953 Returns:
5054 str: The MD5 hash of the file.
5155 """
52- BUF_SIZE = 65536 # read in 64KiB chunks
56+ return _hash_file (path , hashlib .md5 ()).hexdigest ()
57+
58+
def hash_files_or_dirs(paths: List[str]) -> str:
    """Compute one MD5 hex digest covering a list of files and/or directories.

    The paths are processed in sorted order, so the ordering of the input
    list does not influence the result. Each path is folded into a single
    running digest by ``_hash_file_or_dir``, which means the digest reflects:

    * the path strings given in the input list
    * the names of files/directories nested under any listed directory
    * the contents of every file that is reached

    Args:
        paths: List of file or directory paths.
    Returns:
        str: The MD5 hex digest of the list of files or directories.
    """
    digest = hashlib.md5()
    ordered_paths = sorted(paths)
    for entry in ordered_paths:
        # The helper hands back the (updated) hash object; keep threading it.
        digest = _hash_file_or_dir(entry, digest)
    return digest.hexdigest()
77+
78+
def _hash_file_or_dir(path: str, md5: Hash) -> Hash:
    """Feed a single path and whatever it points at into the given hash.

    A ``file://`` URL is first reduced to its unquoted filesystem path. The
    resulting path string itself is hashed, followed by the directory tree
    or file contents found there. A path that is neither an existing
    directory nor an existing file contributes only its path string.

    Args:
        path: path of file or directory (optionally a ``file://`` URL)
        md5: hash object to update
    Returns:
        Hash: the hash object after it has been updated
    """
    is_file_url = isinstance(path, str) and path.lower().startswith("file://")
    if is_file_url:
        path = unquote(urlparse(path).path)

    # The path text is part of the digest, not just the data behind it.
    md5.update(path.encode())

    target = Path(path)
    if target.is_dir():
        return _hash_dir(path, md5)
    if target.is_file():
        return _hash_file(path, md5)
    return md5
95+
96+
def _hash_dir(directory: Union[str, Path], md5: Hash) -> Hash:
    """Feed the names and contents of everything under a directory into the hash.

    Entries are visited in sorted order. For each entry its name is hashed
    first; regular files then contribute their contents and sub-directories
    are descended into recursively.

    Args:
        directory: path of the directory
        md5: hash object to update
    Returns:
        Hash: the hash object after it has been updated
    Raises:
        ValueError: if ``directory`` is not an existing directory
    """
    root = Path(directory)
    if not root.is_dir():
        raise ValueError(f"{directory} is not a valid directory")

    children = list(root.iterdir())
    children.sort()
    for child in children:
        md5.update(child.name.encode())
        if child.is_file():
            md5 = _hash_file(child, md5)
            continue
        if child.is_dir():
            md5 = _hash_dir(child, md5)
    return md5
114+
115+
def _hash_file(file: Union[str, Path], md5: Hash) -> Hash:
    """Feed the contents of one file into the given hash.

    A string starting with ``file://`` is first reduced to its unquoted
    filesystem path. The file is read in binary mode, ``BUF_SIZE`` bytes at
    a time, so it is never loaded into memory as a whole.

    Args:
        file: path of the file (optionally a ``file://`` URL string)
        md5: hash object to update
    Returns:
        Hash: the hash object after it has been updated
    Raises:
        ValueError: if ``file`` is not an existing regular file
    """
    if isinstance(file, str) and file.lower().startswith("file://"):
        file = unquote(urlparse(file).path)

    if not Path(file).is_file():
        raise ValueError(str(file) + " is not a valid file")

    with open(file, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(BUF_SIZE), b""):
            md5.update(chunk)
    return md5
0 commit comments