# curaytah/example_qa_extraction.yaml at main · ltjed/curaytah · GitHub

# Example configuration for Q&A extraction from text documents
# This demonstrates a two-round workflow:
# Round 1: Extract Q&A pairs from raw text
# Round 2: Add thinking traces to the Q&A pairs

# Free-text description of what this run should produce. Quoted because the
# text contains `&`; the parsed string is unchanged.
goal: 'Extract Q&A pairs from text documents and augment with reasoning traces'
# Model identifier and API backend name.
# NOTE(review): presumably the model that generates the data and the client
# used to reach it — confirm against the pipeline code.
teacher_model: gemini-2.5-flash
api_type: vertex

# Sampling settings.
# NOTE(review): presumably forwarded to the teacher model on every generation
# request — confirm against the pipeline code.
generation_config:
  temperature: 0.7
  top_p: 0.9

# Vertex settings (presumably only read when `api_type` is `vertex`).
# NOTE: `project_id` and `bucket_name` are placeholder values; replace them
# with your own before running.
vertex_config:
  project_id: your-project-id
  location: us-central1
  bucket_name: your-bucket-name

# Location and sizing parameters for the input documents.
# NOTE(review): the unit of `chunk_size` (tokens vs. characters) is not stated
# in this file — confirm against the data loader.
input_data_metadata:
  path: data/your_data_directory/  # placeholder; point at your own data
  chunk_size: 100000  # 100 thousand
  tokens_per_part: 100000000  # 100 million

# Generation rounds, listed in the order they appear in the workflow.
data_gen_procedures:
  # Round 1: extract Q&A pairs from raw text.
  round_1:
    prompt_name: extract_qa_pairs
    extraction_config: extraction_configs/list_extraction_example.yaml
    # 1:N transformation: creates multiple Q&A pairs from a single chunk.
    has_branching: true

  # Round 2: add thinking traces to the Q&A pairs.
  round_2:
    prompt_name: add_thinking_traces
    extraction_config: extraction_configs/add_thinking_traces.yaml
    # 1:1 transformation: augments each Q&A with thinking.
    has_branching: false
    validation:
      enabled: true
      # Filter rule written as a sentence. Folded (`>-`) to stay within the
      # line-length limit; it still parses to a single-line string.
      filter_1: >-
        is_valid == True only if the value of column thinking
        is not null

# Output formatting. The value is a path to a Python file (.py), not a YAML
# file.
# NOTE(review): presumably a template module applied to the final records —
# confirm how the path is resolved.
output:
  output_format: format_templates/chat_format_qa_with_thinking.py