|
| 1 | +--- |
| 2 | +pagetitle: "Examples: Advanced Validation" |
| 3 | +notebook-links: false |
| 4 | +page-navigation: false |
| 5 | +toc: false |
| 6 | +html-table-processing: none |
| 7 | +--- |
| 8 | + |
| 9 | +### Advanced Validation |
| 10 | + |
| 11 | +A validation with a comprehensive set of rules. |
| 12 | + |
| 13 | +```{python} |
| 14 | +#| echo: false |
| 15 | +
|
| 16 | +import pointblank as pb |
| 17 | +import polars as pl |
| 18 | +import narwhals as nw |
| 19 | +
|
| 20 | +validation = ( |
| 21 | + pb.Validate( |
| 22 | + data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), |
| 23 | + tbl_name="game_revenue", |
| 24 | + label="Comprehensive validation example", |
| 25 | + thresholds=pb.Thresholds(warn_at=0.10, stop_at=0.25, notify_at=0.35), |
| 26 | + ) |
| 27 | + .col_vals_regex(columns="player_id", pattern=r"^[A-Z]{12}[0-9]{3}$") # STEP 1 |
| 28 | + .col_vals_gt(columns="session_duration", value=5) # STEP 2 |
| 29 | + .col_vals_ge(columns="item_revenue", value=0.02) # STEP 3 |
| 30 | + .col_vals_in_set(columns="item_type", set=["iap", "ad"]) # STEP 4 |
| 31 | + .col_vals_in_set( # STEP 5 |
| 32 | + columns="acquisition", |
| 33 | + set=["google", "facebook", "organic", "crosspromo", "other_campaign"] |
| 34 | + ) |
| 35 | + .col_vals_not_in_set(columns="country", set=["Mongolia", "Germany"]) # STEP 6 |
| 36 | + .col_vals_expr(expr=nw.when( # STEP 7 |
| 37 | + nw.col("item_type") == "iap") |
| 38 | + .then(nw.col("item_name").str.contains(r"^[a-z]*?\d$")) |
| 39 | + ) |
| 40 | + .col_vals_between( # STEP 8 |
| 41 | + columns="session_duration", |
| 42 | + left=10, right=50, |
| 43 | + pre=lambda df: df.select(pl.median("session_duration")) |
| 44 | + ) |
| 45 | + .rows_distinct(columns_subset=["player_id", "session_id", "time"]) # STEP 9 |
| 46 | + .row_count_match(count=2000) # STEP 10 |
| 47 | + .col_count_match(count=11) # STEP 11 |
| 48 | + .col_vals_not_null(columns=pb.starts_with("item")) # STEP 12-14 |
| 49 | + .col_exists(columns="start_day") # STEP 15 |
| 50 | + .interrogate() |
| 51 | +) |
| 52 | +
|
| 53 | +validation |
| 54 | +``` |
| 55 | + |
| 56 | +```python |
| 57 | +import pointblank as pb |
| 58 | +import polars as pl |
| 59 | +import narwhals as nw |
| 60 | + |
| 61 | +validation = ( |
| 62 | + pb.Validate( |
| 63 | + data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), |
| 64 | + tbl_name="game_revenue", |
| 65 | + label="Comprehensive validation example", |
| 66 | + thresholds=pb.Thresholds(warn_at=0.10, stop_at=0.25, notify_at=0.35), |
| 67 | + ) |
| 68 | + .col_vals_regex(columns="player_id", pattern=r"^[A-Z]{12}[0-9]{3}$") # STEP 1 |
| 69 | + .col_vals_gt(columns="session_duration", value=5) # STEP 2 |
| 70 | + .col_vals_ge(columns="item_revenue", value=0.02) # STEP 3 |
| 71 | + .col_vals_in_set(columns="item_type", set=["iap", "ad"]) # STEP 4 |
| 72 | + .col_vals_in_set( # STEP 5 |
| 73 | + columns="acquisition", |
| 74 | + set=["google", "facebook", "organic", "crosspromo", "other_campaign"] |
| 75 | + ) |
| 76 | + .col_vals_not_in_set(columns="country", set=["Mongolia", "Germany"]) # STEP 6 |
| 77 | + .col_vals_expr(expr=nw.when( # STEP 7 |
| 78 | + nw.col("item_type") == "iap") |
| 79 | + .then(nw.col("item_name").str.contains(r"^[a-z]*?\d$")) |
| 80 | + ) |
| 81 | + .col_vals_between( # STEP 8 |
| 82 | + columns="session_duration", |
| 83 | + left=10, right=50, |
| 84 | + pre=lambda df: df.select(pl.median("session_duration")) |
| 85 | + ) |
| 86 | + .rows_distinct(columns_subset=["player_id", "session_id", "time"]) # STEP 9 |
| 87 | + .row_count_match(count=2000) # STEP 10 |
| 88 | + .col_count_match(count=11) # STEP 11 |
| 89 | + .col_vals_not_null(columns=pb.starts_with("item")) # STEP 12-14 |
| 90 | + .col_exists(columns="start_day") # STEP 15 |
| 91 | + .interrogate() |
| 92 | +) |
| 93 | + |
| 94 | +validation |
| 95 | +``` |
| 96 | + |
| 97 | +<details> |
| 98 | +<summary>Preview of Input Table</summary> |
| 99 | + |
| 100 | +```{python} |
| 101 | +#| echo: false |
| 102 | +pb.preview(pb.load_dataset(dataset="game_revenue"), n_head=10, n_tail=10) |
| 103 | +``` |
| 104 | + |
| 105 | +</details> |
0 commit comments