88from typing import Optional
99
1010import django
11+ import fsspec
1112import msoffcrypto
1213import openpyxl
1314import pandas as pd
@@ -123,6 +124,16 @@ def completed_bss_metadata(config: BSSMetadataConfig, bss_metadata) -> Output[pd
123124 """
124125 A DataFrame containing the BSS Metadata that has been completed sufficiently to allow the BSS to be loaded.
125126 """
127+ # Exclude BSS where the listed BSS file does not exist
128+ protocol , bss_files_folder = config .bss_files_folder .split ("://" )
129+ # The listings cache is disabled because otherwise gdrivefs returns an empty list of files
130+ fs = fsspec .filesystem (protocol , use_listings_cache = False , ** config .bss_files_storage_options )
131+ bss_paths = []
132+ for dirpath , dirnames , filenames in fs .walk (bss_files_folder ):
133+ bss_paths .extend ([os .path .join (dirpath , filename )[len (bss_files_folder ) + 1 :] for filename in filenames ])
134+ bss_metadata = bss_metadata [bss_metadata ["bss_path" ].isin (bss_paths )].sort_values (by = "bss_path" )
135+
136+ # Exclude those BSS where one or more mandatory columns have not been completed
126137 required_columns = [
127138 "bss_path" ,
128139 "code" ,
@@ -135,10 +146,8 @@ def completed_bss_metadata(config: BSSMetadataConfig, bss_metadata) -> Output[pd
135146 "reference_year_end_date" ,
136147 "valid_from_date" ,
137148 ]
138- bss_metadata = bss_metadata [bss_metadata ["bss_exists" ]].sort_values (by = "bss_path" )
139149 mask = bss_metadata [required_columns ].map (lambda x : x == "" )
140150
141- # Drop rows where any of the specified columns have empty strings
142151 complete_df = bss_metadata [~ mask .any (axis = "columns" )]
143152 incomplete_df = bss_metadata [mask .any (axis = "columns" )]
144153
0 commit comments