@@ -2369,3 +2369,301 @@ def cast(
         ```
         """
         return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()})
+
+    def push_to_hub(
+        self,
+        repo_id,
+        config_name: str = "default",
+        set_default: Optional[bool] = None,
+        data_dir: Optional[str] = None,
+        commit_message: Optional[str] = None,
+        commit_description: Optional[str] = None,
+        private: Optional[bool] = None,
+        token: Optional[str] = None,
+        revision: Optional[str] = None,
+        create_pr: Optional[bool] = False,
+        # max_shard_size: Optional[Union[int, str]] = None, # TODO(QL): add arg
+        num_shards: Optional[dict[str, int]] = None,
+        embed_external_files: bool = True,
+    ) -> CommitInfo:
2389+ """Pushes the [`DatasetDict`] to the hub as a Parquet dataset.
2390+ The [`DatasetDict`] is pushed using HTTP requests and does not need to have neither git or git-lfs installed.
2391+
2392+ Each dataset split will be pushed independently. The pushed dataset will keep the original split names.
2393+
2394+ The resulting Parquet files are self-contained by default: if your dataset contains [`Image`] or [`Audio`]
2395+ data, the Parquet files will store the bytes of your images or audio files.
2396+ You can disable this by setting `embed_external_files` to False.
2397+
+        Args:
+            repo_id (`str`):
+                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or
+                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace
+                of the logged-in user.
+            config_name (`str`):
+                Configuration name of a dataset. Defaults to "default".
+            set_default (`bool`, *optional*):
+                Whether to set this configuration as the default one. Otherwise, the default configuration is the one
+                named "default".
+            data_dir (`str`, *optional*):
+                Directory name that will contain the uploaded data files. Defaults to the `config_name` if different
+                from "default", else "data".
+
+                <Added version="2.17.0"/>
+            commit_message (`str`, *optional*):
+                Message to commit while pushing. Will default to `"Upload dataset"`.
+            commit_description (`str`, *optional*):
+                Description of the commit that will be created.
+                Additionally, description of the PR if a PR is created (`create_pr` is True).
+
+                <Added version="2.16.0"/>
+            private (`bool`, *optional*):
+                Whether to make the repo private. If `None` (default), the repo will be public unless the
+                organization's default is private. This value is ignored if the repo already exists.
+            token (`str`, *optional*):
+                An optional authentication token for the Hugging Face Hub. If no token is passed, will default
+                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error
+                if no token is passed and the user is not logged-in.
+            revision (`str`, *optional*):
+                Branch to push the uploaded files to. Defaults to the `"main"` branch.
+            create_pr (`bool`, *optional*, defaults to `False`):
+                Whether to create a PR with the uploaded files or directly commit.
+            num_shards (`Dict[str, int]`, *optional*):
+                Number of shards to write. Defaults to each split's `.num_shards`.
+                Use a dictionary to define a different num_shards for each split.
+            embed_external_files (`bool`, defaults to `True`):
+                Whether to embed file bytes in the shards.
+                In particular, this will do the following before the push for the fields of type:
+
+                - [`Audio`] and [`Image`]: removes local path information and embeds file content in the Parquet files.
+
+        Return:
+            huggingface_hub.CommitInfo
+
+        Example:
+
+        ```python
+        >>> dataset_dict.push_to_hub("<organization>/<dataset_id>")
+        >>> dataset_dict.push_to_hub("<organization>/<dataset_id>", private=True)
+        >>> dataset_dict.push_to_hub("<organization>/<dataset_id>", num_shards={"train": 1024, "test": 8})
+        ```
+
+        If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages):
+
+        ```python
+        >>> english_dataset.push_to_hub("<organization>/<dataset_id>", "en")
+        >>> french_dataset.push_to_hub("<organization>/<dataset_id>", "fr")
+        >>> # later
+        >>> english_dataset = load_dataset("<organization>/<dataset_id>", "en")
+        >>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr")
+        ```
+        """
+        if num_shards is None:
+            num_shards = dict.fromkeys(self)
+        elif not isinstance(num_shards, dict):
+            raise ValueError(
+                "Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {'train': 128, 'test': 4}"
+            )
+
+        self._check_values_type()
+        self._check_values_features()
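+        # Aggregate a single DatasetInfo for this config; per-split sizes are filled in as each split is pushed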
+        total_uploaded_size = 0
+        total_dataset_nbytes = 0
+        info_to_dump: DatasetInfo = next(iter(self.values())).info.copy()
+        info_to_dump.config_name = config_name
+        info_to_dump.splits = SplitDict()
+
+        for split in self.keys():
+            if not re.match(_split_re, split):
+                raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.")
+
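+        # Create the dataset repo if it doesn't exist yet (and the target branch, unless pushing to a PR ref)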
+        api = HfApi(endpoint=config.HF_ENDPOINT, token=token)
+
+        repo_url = api.create_repo(
+            repo_id,
+            token=token,
+            repo_type="dataset",
+            private=private,
+            exist_ok=True,
+        )
+        repo_id = repo_url.repo_id
+
+        if revision is not None and not revision.startswith("refs/pr/"):
+            # We do not call create_branch for a PR reference: 400 Bad Request
+            api.create_branch(
+                repo_id,
+                branch=revision,
+                token=token,
+                repo_type="dataset",
+                exist_ok=True,
+            )
+
+        if not data_dir:
+            data_dir = config_name if config_name != "default" else "data"  # for backward compatibility
+
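+        # Upload each split as Parquet shards and collect the resulting CommitOperationAdd objects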
+        additions = []
+        for split in self.keys():
+            logger.info(f"Pushing split {split} to the Hub.")
+            # The split=key needs to be removed before merging
+            split_additions, uploaded_size, dataset_nbytes = self[split]._push_parquet_shards_to_hub(
+                repo_id,
+                data_dir=data_dir,
+                split=split,
+                token=token,
+                revision=revision,
+                create_pr=create_pr,
+                # max_shard_size=max_shard_size, # TODO(QL): add arg
+                num_shards=num_shards.get(split),
+                embed_external_files=embed_external_files,
+            )
+            additions += split_additions
+            total_uploaded_size += uploaded_size
+            total_dataset_nbytes += dataset_nbytes
+            info_to_dump.splits[split] = SplitInfo(str(split), num_bytes=dataset_nbytes, num_examples=len(self[split]))
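+        # Record aggregate sizes on the dumped info: uploaded Parquet bytes (download_size) vs. dataset bytes (dataset_size)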
+        info_to_dump.download_checksums = None
+        info_to_dump.download_size = total_uploaded_size
+        info_to_dump.dataset_size = total_dataset_nbytes
+        info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes
+
+        # Check if the repo already has a README.md and/or a dataset_infos.json to update them with the new split info (size and pattern)
+        # and delete old split shards (if they exist)
+        repo_with_dataset_card, repo_with_dataset_infos = False, False
+        repo_splits: list[str] = []  # use a list to keep the order of the splits
+        deletions: list[CommitOperationDelete] = []
+        repo_files_to_add = [addition.path_in_repo for addition in additions]
+        for repo_file in api.list_repo_tree(
+            repo_id=repo_id,
+            revision=revision,
+            repo_type="dataset",
+            token=token,
+            recursive=True,
+        ):
+            if not isinstance(repo_file, RepoFile):
+                continue
+            if repo_file.rfilename == config.REPOCARD_FILENAME:
+                repo_with_dataset_card = True
+            elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
+                repo_with_dataset_infos = True
+            elif (
+                repo_file.rfilename.startswith(tuple(f"{data_dir}/{split}-" for split in self.keys()))
+                and repo_file.rfilename not in repo_files_to_add
+            ):
+                deletions.append(CommitOperationDelete(path_in_repo=repo_file.rfilename))
+            elif fnmatch.fnmatch(
+                repo_file.rfilename,
+                PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*"),
+            ):
+                pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED)
+                split_pattern_fields = string_to_dict(repo_file.rfilename, pattern)
+                assert split_pattern_fields is not None
+                repo_split = split_pattern_fields["split"]
+                if repo_split not in repo_splits:
+                    repo_splits.append(repo_split)
+
+        # get the info from the README to update them
+        if repo_with_dataset_card:
+            dataset_card_path = api.hf_hub_download(
+                repo_id,
+                config.REPOCARD_FILENAME,
+                repo_type="dataset",
+                revision=revision,
+            )
+            dataset_card = DatasetCard.load(Path(dataset_card_path))
+            dataset_card_data = dataset_card.data
+            metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
+        # get the deprecated dataset_infos.json to update them
+        elif repo_with_dataset_infos:
+            dataset_card = None
+            dataset_card_data = DatasetCardData()
+            metadata_configs = MetadataConfigs()
+        else:
+            dataset_card = None
+            dataset_card_data = DatasetCardData()
+            metadata_configs = MetadataConfigs()
+        # create the metadata configs if it was uploaded with push_to_hub before metadata configs existed
+        if not metadata_configs and repo_splits:
+            default_metadata_configs_to_dump = {
+                "data_files": [{"split": split, "path": f"data/{split}-*"} for split in repo_splits]
+            }
+            MetadataConfigs({"default": default_metadata_configs_to_dump}).to_dataset_card_data(dataset_card_data)
+        metadata_config_to_dump = {
+            "data_files": [{"split": split, "path": f"{data_dir}/{split}-*"} for split in self.keys()],
+        }
+        configs_to_dump = {config_name: metadata_config_to_dump}
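+        # If requested, mark this config as the default one in the dataset card metadata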
+        if set_default and config_name != "default":
+            if metadata_configs:
+                current_default_config_name = metadata_configs.get_default_config_name()
+                if current_default_config_name == "default":
+                    raise ValueError(
+                        "There exists a configuration named 'default'. To set a different configuration as default, "
+                        "rename the 'default' one first."
+                    )
+                if current_default_config_name:
+                    _ = metadata_configs[current_default_config_name].pop("default")
+                    configs_to_dump[current_default_config_name] = metadata_configs[current_default_config_name]
+            metadata_config_to_dump["default"] = True
+        # push to the deprecated dataset_infos.json
+        if repo_with_dataset_infos:
+            dataset_infos_path = api.hf_hub_download(
+                repo_id,
+                config.DATASETDICT_INFOS_FILENAME,
+                repo_type="dataset",
+                revision=revision,
+            )
+            with open(dataset_infos_path, encoding="utf-8") as f:
+                dataset_infos: dict = json.load(f)
+            dataset_infos[config_name] = asdict(info_to_dump)
+            additions.append(
+                CommitOperationAdd(
+                    path_in_repo=config.DATASETDICT_INFOS_FILENAME,
+                    path_or_fileobj=json.dumps(dataset_infos, indent=4).encode("utf-8"),
+                )
+            )
+        # push to README
+        DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
+        MetadataConfigs(configs_to_dump).to_dataset_card_data(dataset_card_data)
+        dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
+        additions.append(
+            CommitOperationAdd(
+                path_in_repo=config.REPOCARD_FILENAME,
+                path_or_fileobj=str(dataset_card).encode(),
+            )
+        )
+
+        commit_message = commit_message if commit_message is not None else "Upload dataset"
+        if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
+            commit_info = api.create_commit(
+                repo_id,
+                operations=additions + deletions,
+                commit_message=commit_message,
+                commit_description=commit_description,
+                token=token,
+                repo_type="dataset",
+                revision=revision,
+                create_pr=create_pr,
+            )
+        else:
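+            # Too many files for a single commit: chunk the additions; deletions go into the first commit only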
+            logger.info(
+                f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits."
+            )
+            num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT)
+            for i in range(0, num_commits):
+                operations = additions[
+                    i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT
+                ] + (deletions if i == 0 else [])
+                commit_info = api.create_commit(
+                    repo_id,
+                    operations=operations,
+                    commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})",
+                    commit_description=commit_description,
+                    token=token,
+                    repo_type="dataset",
+                    revision=revision,
+                    create_pr=create_pr,
+                )
+                logger.info(
+                    f"Commit #{i + 1} completed"
+                    + (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "")
+                    + "."
+                )
+        return commit_info