# describegpt_defaults.toml
name = "qsv Default Prompt File"
description = "Default prompt file for qsv's describegpt command."
author = "qsv team"
version = "7.0.0"
tokens = 20000
base_url = "https://api.openai.com/v1"
model = "openai/gpt-oss-20b"
timeout = 300
format = "markdown"
# Responses will be in this language/dialect, or the model default if not set.
# Can be set to "traditional" languages like Spanish, German, French, Hindi, Mandarin, etc.
# or "dialects" like Taglish, Pig Latin, Valley Girl, Pirate, Shakespearean English, etc.
language = ""
### NOTE: The following VARIABLES are available in Mini Jinja templates (use {{ variable }} syntax):
# {{ stats }} - summary statistics in CSV format
# {{ frequency }} - frequency distribution in CSV format
# {{ dictionary }} - data dictionary in JSON format
# {{ json_add }} - if --json option is set, inserts (note leading space)
# ` (in valid, pretty-printed JSON format, ensuring string values are properly escaped)`
# otherwise inserts
# ` (in Markdown format)`
# {{ input_table_name }} - the name of the input CSV file.
# In DuckDB SQL mode, this is replaced with the read_csv_auto function call,
# e.g. `read_csv_auto('input.csv')`.
# In Polars SQL --prompt mode, this is replaced with the table alias, e.g. `_t_1`.
# {{ generated_by_signature }} - Attribution metadata and warning placeholder
# {{ duckdb_version }} - only up to the minor version - e.g. "1.3"
# {{ top_n }} - The enum threshold for the frequency command
# {{ num_tags }} - The maximum number of tags to infer when the --tags option is used
# {{ tag_vocab }} - The tag vocabulary file content
# {{ language }} - The language to use for the response. Empty string if not set. Has leading space if set.
# {{ headers }} - CSV headers
# {{ delimiter }} - CSV delimiter character
# {{ sample_file }} - The temporary file containing a random sample of the input CSV file.
# Only present if --prompt is set.
# {{ sample_size }} - The number of rows in the random sample.
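### Example (illustrative only - the wording below is just a sketch of how these
### variables can be combined in a custom prompt template; uncomment and adapt):
# prompt = """
# Using the {{ sample_size }}-row sample of {{ input_table_name }}
# (headers: {{ headers }}, delimiter: `{{ delimiter }}`), summarize the Dataset{{ json_add }}.
# {{ generated_by_signature }}
# """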
system_prompt = """
You are an expert library scientist with extensive expertise in Statistics, Data Science and SQL.
You are also an expert on the DCAT-US 3 metadata specification (https://doi-do.github.io/dcat-us/).
When you are asked to generate a Data Dictionary, Description or Tags, use the provided Summary Statistics and
Frequency Distribution to guide your response. They both describe the same Dataset and are joined on the `field` column.
The provided Summary Statistics is a CSV file where each record contains the statistics for one Dataset field.
The provided Frequency Distribution is a CSV file with these columns - `field`, `value`, `count`, `percentage`, `rank`.
For each Dataset field, it lists the top {{ top_n }} most frequent unique values sorted in descending order,
with the special value "Other (N)" indicating "Other" unique values beyond the top {{ top_n }}.
The "N" in "Other (N)" indicates the count of "Other" unique values. The "Other" category has a special rank of 0.
The Frequency Distribution's `rank` column is 1-based and is calculated based on the count of the values, with the
most frequent value having a rank of 1. In case of ties, `rank` uses the "dense" ranking strategy (AKA "1223" ranking).
For Dataset fields with all unique values (i.e. cardinality is equal to the number of records), the Frequency Distribution's
`value` column is the special value "<ALL_UNIQUE>"; `count` - the number of records; `percentage` - 100; and `rank` - 0.
"""
dictionary_prompt = """
Generate{% if language %} {{ language|trim|title }}{% endif %} Labels and Descriptions for ALL fields in the Dataset. Use the Summary Statistics and Frequency Distribution to understand the context and meaning of each field.
{% set headers_list = headers|split(",")|list %}
The Dataset has {{ headers_list|length }} field{{ headers_list|length | pluralize }}:
{%- for header in headers_list %}
{{ loop.index }}. {{ header | trim }}
{%- endfor %}
For each field, provide:
- Label: a human-friendly label for this field (e.g., "Unique Key", "Created Date", "Agency Name")
- Description: a full description for this field (can be multiple sentences)
Return the results in JSON format where each field name is a key, and the value is an object with "label" and "description" properties:
{% raw %}{
"field_name_1": {
"label": "Human-friendly label",
"description": "Full description of the field"
},
"field_name_2": {
"label": "Another label",
"description": "Another description"
}
}{% endraw %}
Let's think step by step, correcting yourself as needed.
---
Summary Statistics (CSV):
{{ stats }}
Frequency Distribution (CSV):
{{ frequency }}
"""
description_prompt = """
Generate a{% if language %} {{ language|trim|title }}{% endif %} Description based on the following Summary Statistics and Frequency Distribution data about the Dataset.
Let's think step by step.
---
Summary Statistics (CSV):
{{ stats }}
Frequency Distribution (CSV):
{{ frequency }}
---
Do not output the summary statistics for each field. Do not output the frequency for each field.
Do not describe each field individually; instead, describe the dataset as a whole
in one 1-8 sentence description.
After the Description, add a section titled "Notable Characteristics" with a bulleted list of notable
characteristics of the Dataset (e.g. the central tendency and spread of the data, the distribution shape,
anomalies, patterns; if there are any outliers, missing values, duplicates, PII/PHI/PCI data; and
other data quality issues that the User should be aware of).
{% if language %} Make sure both the Description and Notable Characteristics are in this language: {{ language|trim|title }}{% endif %}
Add an Attribution with the placeholder "{{ generated_by_signature }}" at the bottom of the output.
The entire output should be in Markdown format.
"""
tags_prompt = """
A Tag is a keyword or label that groups a dataset with other, similar datasets.
Using the right Tags makes it easier for others to find and use datasets.
{% if tag_vocab %}
Limit your choices to only {{ num_tags }} unique Tags{{ json_add }} in the following Tag Vocabulary,
in order of relevance, based on the Summary Statistics and Frequency Distribution about the Dataset
provided further below.
The Tag Vocabulary is a CSV with 2 columns: Tag and Description.
Take the Description into account to guide your Tag choices.
Tag Vocabulary (CSV):
{{ tag_vocab }}
{% else %}{# NOTE: If no tag vocabulary is provided, use the default tag generation prompt #}
Choose no more than {{ num_tags }} unique Tags{{ json_add }} about the contents of the Dataset
in descending order of importance (lowercase only and use _ to separate words) based on the
Summary Statistics and Frequency Distribution about the Dataset provided below.
Do not use field names in the tags.
{% if language %} Make sure your tag choices are in this language: {{ language|trim|title }} {% endif %}
{% endif %}
Add an Attribution with the placeholder "{{ generated_by_signature }}" after the Tags. If generating JSON format,
add the Attribution as a separate key at the top level of the JSON object, after the Tags, otherwise add it
at the bottom of the Tags in Markdown format.
Let's think step by step, correcting yourself as needed.
---
Summary Statistics (CSV):
{{ stats }}
Frequency Distribution (CSV):
{{ frequency }}"""
prompt = "What is this dataset about?"
custom_prompt_guidance = """
We need to answer the User's Prompt above.
If the User's Prompt is not about the Dataset, immediately return
'I'm sorry. I'm afraid I can only answer questions about the Dataset.'{% if language %} in this language: {{ language|trim|title }}{% endif %}
If the User's Prompt can be answered by using the Dataset's Summary Statistics and
Frequency Distribution data below, immediately return the answer{% if language %} in this language: {{ language|trim|title }}{% endif %}.
Otherwise, using the Dataset's Summary Statistics, Frequency Distribution and Data Dictionary below,
create a SQL query that can be used to answer the User's Prompt.
{# IMPORTANT: make sure the following three lines are not modified, as the DB-specific guidance is injected dynamically here #}
SQL Query Generation Guidelines:
END SQL Query Generation Guidelines
- Make sure the generated SQL query is valid and has{% if language %} {{ language|trim|title }}{% endif %} comments to explain the query
- Add "-- {{ generated_by_signature }}" at the top of the query
{% if sample_file %}
Here is a {{ sample_size }}-row random sample of the Dataset.
Use this sample to understand the Dataset along with the Data Dictionary,
the Summary Statistics and Frequency Distribution to generate the SQL query:
Sample Data (CSV):
{{ sample_file }}
{% endif %}
Return the SQL query as a SQL code block preceded by a newline.
Let's think step by step, correcting yourself as needed.
---
Summary Statistics (CSV):
{{ stats }}
Frequency Distribution (CSV):
{{ frequency }}
Data Dictionary (JSON):
{{ dictionary }}"""
### DuckDB SQL "One-Shot" Query Generation Guidelines
### This list replaces the "END SQL Query Generation Guidelines" marker in custom_prompt_guidance
duckdb_sql_guidance = """
- Use DuckDB {{ duckdb_version }} syntax
- The input csv has headers and uses {{ delimiter }} as the delimiter
- Column names with spaces and special characters are case-sensitive and should be enclosed in double quotes
- Only use the `read_csv_auto` table function to read the input CSV
- Always explicitly cast columns to the appropriate type as per the Data Dictionary using the `CAST` function before using them in calculations.
- Use the placeholder {{ input_table_name }} for the input csv in the `read_csv_auto` table function call
"""
### Polars SQL "One-Shot" Query Generation Guidelines
### This list replaces the "END SQL Query Generation Guidelines" marker in custom_prompt_guidance
polars_sql_guidance = """
- Use Polars SQL syntax (which is a dialect of PostgreSQL)
- Use the Dataset's Summary Statistics, Frequency Distribution and Data Dictionary data to generate the SQL query
- Use {{ input_table_name }} as the placeholder for the table name to query
- Column names with embedded spaces and special characters are case-sensitive and should be enclosed in double quotes
- Only use SQL functions supported by Polars SQL. Supported functions are listed here -
https://github.com/pola-rs/polars/blob/45819db8865b9665e9dfab5b3a54752c8be9d599/crates/polars-sql/src/functions.rs#L773-L890
- Always prefix aliases with the "alias_" prefix.
- Always explicitly cast columns to the appropriate type as per the Data Dictionary using the `CAST` function before using them in calculations.
- For date operations, use REGEXP_LIKE as Polars SQL does not have many date functions apart from `date_part` and `strftime`
- Do not use SIMILAR TO and CROSS JOIN in the generated SQL query
- Only use comments with the `--` prefix.
- Do not use comments using the `/*` and `*/` syntax, as Polars SQL does not support it
"""
### Additional DuckDB SQL Few-Shot Learning Examples when --fewshot-examples is set
dd_fewshot_examples = """
## Few-Shot Learning Examples
Here are examples of common query patterns to follow:
### Example 1: Basic Aggregation
**User Question:** "What is the kurtosis of a given numerical column?"
**SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Calculate kurtosis of a given numerical column
SELECT kurtosis(column_name) as kurtosis_value
FROM read_csv_auto({{ input_table_name }});
```
### Example 2: Group By with Aggregation
**User Question:** "Show me the count by category"
**SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Group records by category and count occurrences
SELECT
category,
COUNT(*) as count
FROM read_csv_auto({{ input_table_name }})
GROUP BY category
ORDER BY count DESC;
```
### Example 3: Date/Time Analysis
**User Question:** "What are the trends over time?"
**SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Analyze trends by date, assuming there's a date column
SELECT
DATE(date_column) as date,
COUNT(*) as daily_count,
AVG(numeric_column) as avg_value
FROM read_csv_auto({{ input_table_name }})
WHERE date_column IS NOT NULL
GROUP BY DATE(date_column)
ORDER BY date;
```
### Example 4: Top N Analysis
**User Question:** "What are the top 10 most frequent values?"
**SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Find top 10 most frequent values in a column
SELECT
column_name,
COUNT(*) as frequency
FROM read_csv_auto({{ input_table_name }})
WHERE column_name IS NOT NULL
GROUP BY column_name
ORDER BY frequency DESC
LIMIT 10;
```
### Example 5: Conditional Analysis
**User Question:** "How many records meet specific criteria?"
**SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Count records that meet specific conditions
SELECT
COUNT(*) as matching_records,
COUNT(*) * 100.0 / (SELECT COUNT(*) FROM read_csv_auto({{ input_table_name }})) as percentage
FROM read_csv_auto({{ input_table_name }})
WHERE condition1 = 'value1'
AND condition2 > 100;
```
### Example 6: Cross-Column Analysis
**User Question:** "What's the relationship between two columns?"
**SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Calculate relationship between two columns
SELECT corr(column2, column1) as correlation,
covar_pop(column2, column1) as covariance_population,
covar_samp(column2, column1) as covariance_sample,
regr_slope(column2, column1) as regression_slope,
regr_intercept(column2, column1) as regression_intercept,
regr_r2(column2, column1) as regression_r_squared,
regr_count(column2, column1) as regression_count
FROM read_csv_auto({{ input_table_name }});
```
### Example 7: Complex Filtering with Subqueries
**User Question:** "Show records above the average"
**SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Find records above the average value
SELECT *
FROM read_csv_auto({{ input_table_name }})
WHERE numeric_column > (
SELECT AVG(numeric_column)
FROM read_csv_auto({{ input_table_name }})
WHERE numeric_column IS NOT NULL
)
ORDER BY numeric_column DESC;
```
### Example 8: Window Functions for Ranking
**User Question:** "Rank the data by some criteria"
**SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Rank records using window functions
SELECT
*,
ROW_NUMBER() OVER (ORDER BY numeric_column DESC) as rank,
RANK() OVER (ORDER BY numeric_column DESC) as rank_with_ties
FROM read_csv_auto({{ input_table_name }})
ORDER BY rank;
```
### Best Practices:
- Always use `read_csv_auto({{ input_table_name }})` to read the CSV
- Include meaningful column aliases (e.g., `COUNT(*) as total_count`)
- Add WHERE clauses to filter out NULL values when appropriate
- Use ORDER BY for meaningful result ordering
- Include LIMIT for large result sets
- Add comments explaining the query logic
- Use appropriate aggregation functions (COUNT, SUM, AVG, MIN, MAX)
- Consider using CTEs (WITH clauses) for complex queries
- Use CASE statements for conditional logic
- Leverage window functions for ranking and running totals"""
### Additional Polars SQL Few-Shot Learning Examples when --fewshot-examples is set
p_fewshot_examples = """
## Few-Shot Learning Examples
Polars SQL is a SQL dialect patterned after PostgreSQL's syntax,
but with Polars-specific functions and operators.
Here are examples of common Polars SQL query patterns to follow:
### Example 1: Basic Aggregation
**User Question:** "What is the 30th percentile (continuous quantile) of a given numerical column?"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Calculate the 30th percentile (continuous quantile) of a given numerical column
SELECT QUANTILE_CONT(column_name, 0.30) as column_name_q30
FROM {{ input_table_name }};
```
### Example 2: Group By with Aggregation
**User Question:** "Show me the count by category"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Group records by category and count occurrences
SELECT
category,
COUNT(*) as count
FROM {{ input_table_name }}
GROUP BY category
ORDER BY count DESC;
```
### Example 3: Date/Time Analysis
**User Question:** "What are the trends over time?"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Analyze trends by date, assuming there's a date column
SELECT
date_part('day', date_column) as day_of_month,
COUNT(*) as daily_count,
AVG(numeric_column) as avg_value
FROM {{ input_table_name }}
WHERE date_column IS NOT NULL
GROUP BY date_part('day', date_column)
ORDER BY day_of_month;
```
### Example 4: Top N Analysis
**User Question:** "What are the top 10 most frequent values?"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Find top 10 most frequent values in a column
SELECT
column_name,
COUNT(*) as frequency
FROM {{ input_table_name }}
WHERE column_name IS NOT NULL
GROUP BY column_name
ORDER BY frequency DESC
LIMIT 10;
```
### Example 5: Conditional Analysis
**User Question:** "How many records meet specific criteria?"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Count records that meet specific conditions
SELECT
COUNT(*) as matching_records,
COUNT(*) * 100.0 / (SELECT COUNT(*) FROM {{ input_table_name }}) as percentage
FROM {{ input_table_name }}
WHERE condition1 = 'value1'
AND condition2 > 100;
```
### Example 6: Cross-Column Analysis
**User Question:** "What's the relationship between two columns?"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Calculate relationship between two columns
SELECT
corr(column1, column2) as pearson_correlation,
covar(column1, column2) as covariance
FROM {{ input_table_name }};
```
### Example 7: Complex Filtering with Subqueries
**User Question:** "Show records above the average"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Find records above the average value
SELECT *
FROM {{ input_table_name }}
WHERE numeric_column > (
SELECT AVG(numeric_column)
FROM {{ input_table_name }}
WHERE numeric_column IS NOT NULL
)
ORDER BY numeric_column DESC;
```
### Example 8: Window Functions for Ranking
**User Question:** "Rank the data by some criteria"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Rank records using window functions
SELECT
*,
ROW_NUMBER() OVER (ORDER BY numeric_column DESC) as rank,
RANK() OVER (ORDER BY numeric_column DESC) as rank_with_ties
FROM {{ input_table_name }}
ORDER BY rank;
```
### Example 9: String Operations
**User Question:** "Extract patterns from text columns"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Extract patterns and analyze string data
SELECT
text_column,
LENGTH(text_column) as text_length,
UPPER(text_column) as uppercase_text,
LOWER(text_column) as lowercase_text,
SUBSTR(text_column, 1, 10) as first_10_chars,
TIMESTAMP(text_column, 'yyyy-MM-dd HH:mm:ss') as timestamp_column,
DATE(text_column, 'yyyy-MM-dd') as date_column,
REGEXP_LIKE(text_column, 'regex_pattern') as is_regex_pattern_present,
STRPOS(text_column, 'substring') as substring_position,
STARTS_WITH(text_column, 'prefix') as starts_with_prefix,
ENDS_WITH(text_column, 'suffix') as ends_with_suffix,
REPLACE(text_column, 'old_substring', 'new_substring') as replaced_text,
CONCAT(text_column, 'concatenated_text') as concatenated_text,
TRIM(text_column) as trimmed_text,
LTRIM(text_column) as ltrimmed_text,
RTRIM(text_column) as rtrimmed_text
FROM {{ input_table_name }}
WHERE text_column IS NOT NULL;
```
### Example 10: Array Operations
**User Question:** "Work with array columns"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Analyze array column data
SELECT
array_column,
ARRAY_CONTAINS(array_column, 'value') as contains_value,
ARRAY_GET(array_column, 1) as array_col_at_1,
ARRAY_LENGTH(array_column) as array_length,
ARRAY_MAX(array_column) as max_value,
ARRAY_MIN(array_column) as min_value
FROM {{ input_table_name }}
WHERE array_column IS NOT NULL;
```
### Example 11: Struct Operations
**User Question:** "Access nested struct fields"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Extract and analyze nested struct data
SELECT
struct_column.field1 as extracted_field1,
struct_column.field2 as extracted_field2,
struct_column.nested.field3 as nested_field3
FROM {{ input_table_name }}
WHERE struct_column IS NOT NULL;
```
### Example 12: Pivot Operations
**User Question:** "Create a pivot table"
**Polars SQL Query:**
```sql
-- {{ generated_by_signature }}
-- Create a pivot table from the data
SELECT
category,
SUM(CASE WHEN status = 'active' THEN value ELSE 0 END) as active_sum,
SUM(CASE WHEN status = 'inactive' THEN value ELSE 0 END) as inactive_sum,
COUNT(CASE WHEN status = 'active' THEN 1 END) as active_count
FROM {{ input_table_name }}
GROUP BY category;
```
### Best Practices for Polars SQL:
- Always use `{{ input_table_name }}` as the placeholder for the table name
- Include meaningful column aliases (e.g., `COUNT(*) as total_count`)
- Add WHERE clauses to filter out NULL values when appropriate
- Use ORDER BY for meaningful result ordering
- Include LIMIT for large result sets
- Add comments explaining the query logic
- Use appropriate aggregation functions (COUNT, SUM, AVG, MIN, MAX, kurtosis, skewness)
- Use CASE statements for conditional logic and pivoting
- Leverage window functions for ranking and running totals
- Take advantage of Polars' optimized array operations
- Use `date_part` for time-based grouping instead of `DATE()` function
- Consider using CTEs (WITH clauses) for complex queries
- Leverage Polars' built-in statistical functions for data analysis"""