|
| 1 | +project: "dpo_example" |
| 2 | +name: "trinity_dpo" |
| 3 | +mode: train |
| 4 | + |
| 5 | +# using task pipeline to decide the chosen and rejected from human preference |
| 6 | +data_processor: |
| 7 | + # task pipeline related |
| 8 | + task_pipeline: |
| 9 | + num_process: 1 |
| 10 | + operators: |
| 11 | + - name: "human_preference_annotation_mapper" |
| 12 | + args: |
| 13 | + # general annotation project settings |
| 14 | + project_name_prefix: "Human_Preference_Annotation_Demo" |
| 15 | + wait_for_annotations: true # Whether to wait for annotations to complete |
| 16 | + timeout: 3600 # Maximum time to wait for annotations in seconds (1 hour) |
| 17 | + poll_interval: 10 # Time between annotation status checks in seconds |
| 18 | + max_tasks_per_batch: 10 # Maximum number of tasks in a single batch |
| 19 | + notification_config: |
| 20 | + enabled: false |
| 21 | + |
| 22 | + # label studio connection settings |
| 23 | + api_url: "http://localhost:7070" # Default Label Studio URL |
| 24 | +        api_key: "YOUR_API_KEY"  # Your API key for label studio authentication, which can be set when starting the label-studio service |
| 25 | + |
| 26 | + # human preference annotation settings |
| 27 | + prompt_key: "prompt" # Prompt field |
| 28 | + answer1_key: "answer1" # First answer option |
| 29 | + answer2_key: "answer2" # Second answer option |
| 30 | + chosen_key: "chosen" # Chosen field |
| 31 | + rejected_key: "rejected" # Rejected field |
| 32 | + inputs: # the output will be set to the explorer input automatically |
| 33 | + - 'examples/dpo_human_in_the_loop/demo-data.jsonl' |
| 34 | + target_fields: ["prompt"] |
| 35 | +service: |
| 36 | + data_juicer: |
| 37 | + auto_start: true |
| 38 | + |
| 39 | +algorithm: |
| 40 | + algorithm_type: dpo |
| 41 | + kl_loss_fn: k1 |
| 42 | + kl_loss_fn_args: |
| 43 | + kl_coef: 0.1 |
| 44 | +checkpoint_root_dir: /PATH/TO/CHECKPOINT/ |
| 45 | +model: |
| 46 | + model_path: /PATH/TO/MODEL |
| 47 | + max_response_tokens: 1024 |
| 48 | + max_model_len: 1536 |
| 49 | +cluster: |
| 50 | + node_num: 1 |
| 51 | + gpu_per_node: 8 |
| 52 | +buffer: |
| 53 | + total_epochs: 2 |
| 54 | + train_batch_size: 64 |
| 55 | + trainer_input: |
| 56 | + experience_buffer: |
| 57 | + name: dpo_buffer |
| 58 | + storage_type: file |
| 59 | +      enable_progress_bar: true |
| 60 | + path: ./outputs/human_annotation_output/ # the result data after human preference annotation are stored here |
| 61 | + format: |
| 62 | + prompt_type: plaintext # plaintext/messages |
| 63 | + prompt_key: prompt |
| 64 | + chosen_key: chosen |
| 65 | + rejected_key: rejected |
| 66 | +synchronizer: |
| 67 | + sync_method: 'checkpoint' |
| 68 | + sync_interval: 30 |
| 69 | + sync_timeout: 1200 |
| 70 | +trainer: |
| 71 | + trainer_type: 'verl' |
| 72 | + trainer_config_path: 'examples/dpo_human_in_the_loop/train_dpo.yaml' |
| 73 | + save_interval: 30 |
0 commit comments