@@ -869,27 +869,33 @@ def remote_worker(
869869 logging .info ("Total tokens calculated: %d" , tokens )
870870
871871 # Build directory structure from loaded documents
872- # Format matches local file uploads: flat structure with type, size_bytes, token_count
872+ # Format matches local file uploads: nested structure with type, size_bytes, token_count
873873 directory_structure = {}
874874 for doc in raw_docs :
875- # Get the file path/name from doc_id or extra_info
876- file_path = doc .doc_id or ""
877- if not file_path and doc .extra_info :
878- file_path = doc .extra_info .get ("key" , "" ) or doc .extra_info .get (
879- "title" , ""
875+ # Get the file path from extra_info
876+ # For crawlers: file_path is a virtual path like "guides/setup.md"
877+ # For other remotes: use key or title as fallback
878+ file_path = ""
879+ if doc .extra_info :
880+ file_path = (
881+ doc .extra_info .get ("file_path" , "" )
882+ or doc .extra_info .get ("key" , "" )
883+ or doc .extra_info .get ("title" , "" )
880884 )
885+ if not file_path :
886+ file_path = doc .doc_id or ""
881887
882888 if file_path :
883- # Use just the filename (last part of path) for flat structure
884- file_name = file_path .split ("/" )[- 1 ] if "/" in file_path else file_path
885-
886889 # Calculate token count
887- token_count = len (doc .text . split () ) if doc .text else 0
890+ token_count = num_tokens_from_string (doc .text ) if doc .text else 0
888891
889892 # Estimate size in bytes from text content
890893 size_bytes = len (doc .text .encode ("utf-8" )) if doc .text else 0
891894
892895 # Guess mime type from extension
896+ file_name = (
897+ file_path .split ("/" )[- 1 ] if "/" in file_path else file_path
898+ )
893899 ext = os .path .splitext (file_name )[1 ].lower ()
894900 mime_types = {
895901 ".txt" : "text/plain" ,
@@ -909,11 +915,23 @@ def remote_worker(
909915 }
910916 file_type = mime_types .get (ext , "application/octet-stream" )
911917
912- directory_structure [file_name ] = {
913- "type" : file_type ,
914- "size_bytes" : size_bytes ,
915- "token_count" : token_count ,
916- }
918+ # Build nested directory structure from path
919+ # e.g., "guides/setup.md" -> {"guides": {"setup.md": {...}}}
920+ path_parts = file_path .split ("/" )
921+ current_level = directory_structure
922+ for i , part in enumerate (path_parts ):
923+ if i == len (path_parts ) - 1 :
924+ # Last part is the file
925+ current_level [part ] = {
926+ "type" : file_type ,
927+ "size_bytes" : size_bytes ,
928+ "token_count" : token_count ,
929+ }
930+ else :
931+ # Intermediate parts are directories
932+ if part not in current_level :
933+ current_level [part ] = {}
934+ current_level = current_level [part ]
917935
918936 logging .info (
919937 f"Built directory structure with { len (directory_structure )} files: "
0 commit comments