1717from os import walk as os_walk
1818from os .path import abspath
1919from os .path import exists
20+ from os .path import isfile
2021from os .path import expanduser
2122from os .path import join
2223from os .path import normpath
3334 temp_dir = tempfile .mkdtemp (prefix = 'scancode-resource-cache' )
3435
3536from commoncode import ignore
36- from commoncode import paths
3737from commoncode .datautils import List
3838from commoncode .datautils import Mapping
3939from commoncode .datautils import String
4545from commoncode .fileutils import file_name
4646from commoncode .fileutils import parent_directory
4747from commoncode .fileutils import splitext_name
48+ from commoncode .paths import split as paths_split
4849
4950"""
5051This module provides Codebase and Resource objects as an abstraction for files
5960"""
6061
# Tracing flags: keep these off in committed code. They are only meant to be
# flipped locally when debugging.
TRACE = False
TRACE_DEEP = False
6465
6566
@@ -308,16 +309,18 @@ def __init__(
308309 self ._setup_essentials (temp_dir , max_in_memory )
309310
310311 # finally populate
311- self .paths = self ._clean_paths (paths )
312+ self .paths = self ._prepare_clean_paths (paths )
312313 self ._populate ()
313314
def _prepare_clean_paths(self, paths=tuple()):
    """
    Return a new sorted list of cleaned ``paths``, possibly empty.

    Falsy entries of ``paths`` (such as None or empty strings) are skipped.
    Each remaining entry is passed through ``clean_path()`` to convert it to
    POSIX and ensure it has no slash at both ends. Duplicates are kept.

    The returned list is sorted on path segments so that a directory comes
    first and is immediately followed by everything it contains.
    """
    cleaned = (clean_path(p) for p in (paths or []) if p)
    # Sort on the list of segments rather than on the raw string: with a
    # plain string sort 'foo-bar' would land between 'foo' and 'foo/bar'.
    return sorted(cleaned, key=lambda p: p.split('/'))
321324
322325 def _setup_essentials (self , temp_dir = temp_dir , max_in_memory = 10000 ):
323326 """
@@ -465,10 +468,79 @@ def _populate(self):
465468 # childless directory
466469 return
467470
471+ if self .paths :
472+ return self ._create_resources_from_paths (root = root , paths = self .paths )
473+ else :
474+ return self ._create_resources_from_root (root = root )
475+
def _create_resources_from_paths(self, root, paths):
    """
    Create the Resources for each path of the ``paths`` iterable, together
    with any ancestor directory Resource that does not exist yet.

    Each path is joined to the parent directory of ``root.location`` to
    obtain its location on disk. ``root`` is the starting parent for every
    path.

    NOTE: ignored files are NOT skipped here (no ``skip_ignored``).

    Raise an Exception if a path does not exist on disk (a message is also
    appended to ``self.errors`` first) or if an ancestor Resource cannot be
    created.
    """
    base_location = parent_directory(root.location)

    # track resources parents by path during construction to avoid
    # recreating all ancestor directories
    parents_by_path = {root.path: root}

    for path in paths:
        res_loc = join(base_location, path)
        if not exists(res_loc):
            msg = (
                f'ERROR: cannot populate codebase: '
                f'path: {path!r} not found in {res_loc!r}'
            )
            self.errors.append(msg)
            # A missing path is fatal: record it, then abort population.
            raise Exception(path, res_loc)

        # create all parents. The last parent is the one we want to use
        parent = root
        if TRACE:
            logger_debug('Codebase._create_resources_from_paths: parent', parent)

        for parent_path in get_ancestor_paths(path, include_self=False):
            if TRACE:
                logger_debug(' Codebase._create_resources_from_paths: parent_path', repr(parent_path))
            if not parent_path:
                continue

            newpar = parents_by_path.get(parent_path)
            if TRACE:
                logger_debug(' Codebase._create_resources_from_paths: newpar', repr(newpar))

            if not newpar:
                newpar = self._get_or_create_resource(
                    name=file_name(parent_path),
                    parent=parent,
                    path=parent_path,
                    is_file=False,
                )
                if not newpar:
                    raise Exception(f'ERROR: Codebase._create_resources_from_paths: cannot create parent for: {parent_path}')
                if TRACE:
                    logger_debug('Codebase._create_resources_from_paths: created newpar:', repr(newpar))

            # Always descend, whether the ancestor was just created or was
            # found in the cache: the next level must hang off this one.
            parent = newpar
            parents_by_path[parent_path] = parent

        res = self._get_or_create_resource(
            name=file_name(path),
            parent=parent,
            path=path,
            is_file=isfile(res_loc),
        )
        if TRACE:
            logger_debug('Codebase._create_resources_from_paths: resource', res)
536+
537+ def _create_resources_from_root (self , root ):
538+ # without paths we walk the root location top-down
539+
468540 # track resources parents by location during construction.
469541 # NOTE: this cannot exhaust memory on a large codebase, because we do
470542 # not keep parents already walked and we walk topdown.
471- parent_by_loc = {root .location : root }
543+ parents_by_loc = {root .location : root }
472544
473545 def err (_error ):
474546 """os.walk error handler"""
@@ -483,7 +555,7 @@ def err(_error):
483555 max_depth = self .max_depth ,
484556 error_handler = err ,
485557 ):
486- parent = parent_by_loc .pop (top )
558+ parent = parents_by_loc .pop (top )
487559 for created in self ._create_resources (
488560 parent = parent ,
489561 top = top ,
@@ -492,7 +564,7 @@ def err(_error):
492564 ):
493565 # on the plain, bare FS, files cannot be parents
494566 if not created .is_file :
495- parent_by_loc [created .location ] = created
567+ parents_by_loc [created .location ] = created
496568
497569 def _create_resources (self , parent , top , dirs , files , skip_ignored = skip_ignored ):
498570 """
@@ -575,7 +647,7 @@ def _get_or_create_resource(
575647 Create and return a new codebase Resource with ``path`` and ``location``.
576648 """
577649 if not parent :
578- raise TypeError ('Cannot create resource without parent. ' )
650+ raise TypeError (f 'Cannot create resource without parent: name: { name !r } , path: { path !r } ' )
579651
580652 # If the codebase is virtual, we provide the path
581653 if not path :
@@ -1461,7 +1533,7 @@ def strip_first_path_segment(path):
14611533 >>> strip_first_path_segment('foo/')
14621534 'foo/'
14631535 """
1464- segments = paths . split (path )
1536+ segments = paths_split (path )
14651537 if not segments or len (segments ) == 1 :
14661538 return path
14671539 stripped = segments [1 :]
@@ -1566,7 +1638,7 @@ def __init__(
15661638 self .location = location
15671639
15681640 scan_data = self ._get_scan_data (location )
1569- self .paths = self ._clean_paths (paths )
1641+ self .paths = self ._prepare_clean_paths (paths )
15701642 self ._populate (scan_data )
15711643
15721644 def _get_scan_data_helper (self , location ):
@@ -1754,7 +1826,7 @@ def _populate(self, scan_data):
17541826
17551827 for fdata in files_data :
17561828 sample_resource_data_update (fdata )
1757- segments = fdata ['path' ].split ('/' )
1829+ segments = fdata ['path' ].split ('/' )
17581830 root_names_add (segments [0 ])
17591831 fdata ['path_segments' ] = segments
17601832
@@ -1816,12 +1888,21 @@ def _populate(self, scan_data):
18161888 setattr (root , name , value )
18171889
18181890 if TRACE : logger_debug ('VirtualCodebase.populate: root:' , root )
1891+
1892+ # TODO: report an error if filtering the root with paths?
18191893 self .save_resource (root )
18201894
18211895 if self .has_single_resource :
18221896 if TRACE : logger_debug ('VirtualCodebase.populate: with single resource.' )
18231897 return
18241898
1899+ all_paths = None
1900+ if self .paths :
1901+ # build a set of all paths and all their ancestors
1902+ all_paths = set ()
1903+ for path in self .paths :
1904+ all_paths .update (get_ancestor_paths (path , include_self = True ))
1905+
18251906 # Create other Resources from scan info
18261907
18271908 # Note that we do not know the ordering there.
@@ -1833,10 +1914,15 @@ def _populate(self, scan_data):
18331914 duplicated_paths = set ()
18341915 last_path = None
18351916 for fdata in files_data :
1917+ path = fdata .get ('path' )
1918+
1919+ # skip the ones we did not request
1920+ if all_paths and path not in all_paths :
1921+ continue
1922+
18361923 # these are no longer needed
18371924 path_segments = fdata .pop ('path_segments' )
18381925
1839- path = fdata .get ('path' )
18401926 if not last_path :
18411927 last_path = path
18421928 elif last_path == path :
@@ -1935,3 +2021,27 @@ def remove_properties_and_basics(resource_data):
19352021 mapping with the known properties removed.
19362022 """
19372023 return {k : v for k , v in resource_data .items () if k not in KNOW_PROPS }
2024+
2025+
def get_ancestor_paths(path, include_self=False):
    """
    Yield the ancestor paths of a POSIX ``path``, from the topmost segment
    down to the direct parent. Also yield ``path`` itself as the last item
    if ``include_self`` is True.

    ``path`` is split on "/" as-is: a leading slash therefore yields an
    empty string as the first item.

    Raise a ValueError when iterated with an empty ``path``.

    For example::

    >>> path = 'foo/bar/baz'
    >>> results = list(get_ancestor_paths(path))
    >>> assert results == ['foo', 'foo/bar'], results
    >>> results = list(get_ancestor_paths(path, include_self=True))
    >>> assert results == ['foo', 'foo/bar', 'foo/bar/baz'], results
    >>> results = list(get_ancestor_paths('foo', include_self=False))
    >>> assert results == [], results
    """
    # Explicit check rather than ``assert``, which is stripped under -O.
    if not path:
        raise ValueError(f'A non-empty path is required: {path!r}')

    segments = path.split('/')
    if not include_self:
        segments = segments[:-1]

    # Grow the path one segment at a time, yielding each intermediate path.
    subpath = []
    for segment in segments:
        subpath.append(segment)
        yield '/'.join(subpath)
0 commit comments