Skip to content

Commit b285157

Browse files
gucciwangCalvin WangJoan Aoanan
authored
Hotfix and version bump (#60)
* list of constraints and better tests and vbump * Slack link update Co-authored-by: Calvin Wang <[email protected]> Co-authored-by: Joan Aoanan <[email protected]>
1 parent 4243d29 commit b285157

File tree

9 files changed

+101
-67
lines changed

9 files changed

+101
-67
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@ There are 4 main components of Deequ, and they are:
1818
![](imgs/pydeequ_architecture.jpg)
1919

2020
## 🎉 Announcements 🎉
21+
- With PyDeequ v0.1.8, we now officially support Spark 3! Just make sure you have an environment variable `SPARK_VERSION` to specify your Spark version!
2122
- We've released a blogpost on integrating PyDeequ onto AWS leveraging services such as AWS Glue, Athena, and SageMaker! Check it out: [Monitor data quality in your data lake using PyDeequ and AWS Glue](https://aws.amazon.com/blogs/big-data/monitor-data-quality-in-your-data-lake-using-pydeequ-and-aws-glue/).
2223
- Check out the [PyDeequ Release Announcement Blogpost](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/) with a tutorial walkthrough of the Amazon Reviews dataset!
23-
- Join the PyDeequ community on [PyDeequ Slack](https://join.slack.com/t/pydeequ/shared_invite/zt-qopmmfgm-ajKRyxx0HgCiK50b9JhAFg) to chat with the devs!
24+
- Join the PyDeequ community on [PyDeequ Slack](https://join.slack.com/t/pydeequ/shared_invite/zt-te6bntpu-yaqPy7bhiN8Lu0NxpZs47Q) to chat with the devs!
2425

2526
## Quickstart
2627

pydeequ/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# ANY KIND, either express or implied. See the License for the specific
1313
# language governing permissions and limitations under the License.
1414
"""Placeholder docstrings"""
15-
__version__ = "0.1.5"
15+
__version__ = "1.0.0"
1616

1717
from pyspark.sql import SparkSession
1818

pydeequ/checks.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,19 @@ def __init__(self, spark_session: SparkSession, level: CheckLevel, description:
100100
for constraint in self.constraints:
101101
self.addConstraint(constraint)
102102

103+
def addConstraints(self, constraints: list):
104+
self.constraints.extend(constraints)
105+
for constraint in constraints:
106+
self._Check = constraint._Check
107+
103108
def addConstraint(self, constraint):
104109
"""
105110
Returns a new Check object with the given constraints added to the constraints list.
106111
:param Constraint constraint: new constraint to be added.
107112
:return: new Check object
108113
"""
109-
raise NotImplementedError("Private factory method for other check methods")
114+
self.constraints.append(constraint)
115+
self._Check = constraint._Check
110116

111117
def addFilterableContstraint(self, creationFunc):
112118
"""Adds a constraint that can subsequently be replaced with a filtered version

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pydeequ"
3-
version = "0.1.8"
3+
version = "1.0.0"
44
description = "PyDeequ - Unit Tests for Data"
55
authors = ["Calvin Wang <[email protected]>"]
66
maintainers = ["Calvin Wang <[email protected]>"]

tests/test_analyzers.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def test_ApproxCountDistinct(self):
232232
self.assertEqual(self.ApproxCountDistinct("b"), [Row(value=3)])
233233
self.assertEqual(self.ApproxCountDistinct("c"), [Row(value=2)])
234234

235-
@pytest.mark.skip(reason="@unittest.expectedFailure")
235+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
236236
def test_fail_approxCountDistinct(self):
237237
self.assertEqual(self.ApproxCountDistinct("b"), [Row(value=2)])
238238

@@ -257,7 +257,7 @@ def UniqueValueRatio(self, columns, where=None):
257257
self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
258258
return result_df.select("value").collect()
259259

260-
@pytest.mark.skip(reason="@unittest.expectedFailure")
260+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
261261
def test_fail_approxQuantiles(self):
262262
self.assertEqual(self.ApproxQuantiles("b", [0.2, 0.5, 0.73]), [Row(value=1.5), Row(value=2.0), Row(value=3.0)])
263263

@@ -270,7 +270,7 @@ def test_Completeness(self):
270270
self.assertEqual(self.Completeness("c"), [Row(value=2 / 3)])
271271
self.assertEqual(self.Completeness("a"), [Row(value=1)])
272272

273-
@pytest.mark.skip(reason="@unittest.expectedFailure")
273+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
274274
def test_fail_Completeness(self):
275275
self.assertEqual(self.Completeness("c"), [Row(value=1.0)])
276276

@@ -285,7 +285,7 @@ def test_Correlation(self):
285285
self.assertEqual(self.Correlation("b", "d"), [Row(value=0.0)])
286286
self.assertEqual(self.Correlation("b", "a"), [])
287287

288-
@pytest.mark.skip(reason="@unittest.expectedFailure")
288+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
289289
def test_fail_Correlation(self):
290290
self.assertEqual(self.Correlation("b", "c"), [Row(value=-1.0)])
291291

@@ -294,7 +294,7 @@ def test_CountDistinct(self):
294294
self.assertEqual(self.CountDistinct(["b", "c"]), [Row(value=3.0)])
295295
self.assertEqual(self.CountDistinct(["b", "d"]), [Row(value=3.0)])
296296

297-
@pytest.mark.skip(reason="@unittest.expectedFailure")
297+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
298298
def test_fail_CountDistinct(self):
299299
self.assertEqual(self.CountDistinct("b"), [Row(value=1.0)])
300300

@@ -332,7 +332,7 @@ def test_DataType(self):
332332
],
333333
)
334334

335-
@pytest.mark.skip(reason="@unittest.expectedFailure")
335+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
336336
def test_fail_Datatype(self):
337337
self.assertEqual(
338338
self.Datatype("c"),
@@ -356,7 +356,7 @@ def test_Distinctness(self):
356356
self.assertEqual(self.Distinctness(["b", "c"]), [Row(value=1.0)])
357357
self.assertEqual(self.Distinctness(["b", "d"]), [Row(value=1.0)])
358358

359-
@pytest.mark.skip(reason="@unittest.expectedFailure")
359+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
360360
def test_fail_Distinctness(self):
361361
self.assertEqual(self.Distinctness("b"), [Row(value=0)])
362362

@@ -365,7 +365,7 @@ def test_Entropy(self):
365365
self.assertEqual(self.Entropy("a"), [Row(value=1.0986122886681096)])
366366
self.assertEqual(self.Entropy("c"), [Row(value=0.6931471805599453)])
367367

368-
@pytest.mark.skip(reason="@unittest.expectedFailure")
368+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
369369
def test_fail_Entropy(self):
370370
self.assertEqual(self.Entropy("b"), [Row(value=0)])
371371

@@ -395,7 +395,7 @@ def test_Histogram(self):
395395
],
396396
)
397397

398-
@pytest.mark.skip(reason="@unittest.expectedFailure")
398+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
399399
def test_fail_Histogram(self):
400400
self.assertEqual(
401401
self.Histogram("b"),
@@ -432,7 +432,7 @@ def test_Histogram_maxBins(self):
432432
],
433433
)
434434

435-
@pytest.mark.skip(reason="@unittest.expectedFailure")
435+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
436436
def test_fail_Histogram_maxBins(self):
437437
self.assertEqual(
438438
self.Histogram_maxBins("b"),
@@ -450,46 +450,46 @@ def test_fail_Histogram_maxBins(self):
450450
def test_Maximum(self):
451451
self.assertEqual(self.Maximum("b"), [Row(value=3.0)])
452452

453-
@pytest.mark.skip(reason="@unittest.expectedFailure")
453+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
454454
def test_fail_Maximum(self):
455455
self.assertEqual(self.Maximum("c"), [Row(value=3.0)])
456456

457457
def test_MaxLength(self):
458458
self.assertEqual(self.MaxLength("a"), [Row(value=3.0)])
459459

460-
@pytest.mark.skip(reason="@unittest.expectedFailure")
460+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
461461
def test_fail_MaxLength(self):
462462
self.assertEqual(self.MaxLength("b"), [Row(value=3.0)])
463463

464464
def test_Mean(self):
465465
self.assertEqual(self.Mean("b"), [Row(value=2.0)])
466466
self.assertEqual(self.Mean("c"), [Row(value=11 / 2)])
467467

468-
@pytest.mark.skip(reason="@unittest.expectedFailure")
468+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
469469
def test_fail_Mean(self):
470470
self.assertEqual(self.Mean("b"), [Row(value=3.0)])
471471

472472
def test_Minimum(self):
473473
self.assertEqual(self.Minimum("b"), [Row(value=1.0)])
474474
self.assertEqual(self.Minimum("c"), [Row(value=5.0)])
475475

476-
@pytest.mark.skip(reason="@unittest.expectedFailure")
476+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
477477
def test_fail_Minimum(self):
478478
self.assertEqual(self.Minimum("a"), [Row(value=3.0)])
479479
self.assertEqual(self.Minimum("b"), [Row(value=3.0)])
480480

481481
def test_MinLength(self):
482482
self.assertEqual(self.MinLength("a"), [Row(value=3.0)])
483483

484-
@pytest.mark.skip(reason="@unittest.expectedFailure")
484+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
485485
def test_fail_MinLength(self):
486486
self.assertEqual(self.MinLength("a"), [])
487487

488488
def test_MutualInformation(self):
489489
self.assertEqual(self.MutualInformation(["b", "c"]), [Row(value=0.7324081924454064)])
490490
self.assertEqual(self.MutualInformation(["b", "d"]), [Row(value=0.6365141682948128)])
491491

492-
@pytest.mark.skip(reason="@unittest.expectedFailure")
492+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
493493
def test_fail_MutualInformation(self):
494494
self.assertEqual(self.MutualInformation(["b", "d"]), [])
495495

@@ -511,7 +511,7 @@ def test_Size(self):
511511
result_df_row = result_df.select("value").collect()
512512
self.assertEqual(result_df_row, [Row(value=3.0)])
513513

514-
@pytest.mark.skip(reason="@unittest.expectedFailure")
514+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
515515
def test_fail_Size(self):
516516
result = self.AnalysisRunner.onData(self.df).addAnalyzer(Size()).run()
517517
result_df = result.select("value").collect()
@@ -521,15 +521,15 @@ def test_StandardDeviation(self):
521521
self.assertEqual(self.StandardDeviation("b"), [Row(value=0.816496580927726)])
522522
self.assertEqual(self.StandardDeviation("c"), [Row(value=0.5)])
523523

524-
@pytest.mark.skip(reason="@unittest.expectedFailure")
524+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
525525
def test_fail_StandardDeviation(self):
526526
self.assertEqual(self.StandardDeviation("c"), [Row(value=0.8)])
527527

528528
def test_Sum(self):
529529
self.assertEqual(self.Sum("b"), [Row(value=6.0)])
530530
self.assertEqual(self.Sum("c"), [Row(value=11.0)])
531531

532-
@pytest.mark.skip(reason="@unittest.expectedFailure")
532+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
533533
def test_fail_Sum(self):
534534
self.assertEqual(self.Sum("b"), [Row(value=3.0)])
535535

@@ -538,15 +538,15 @@ def test_Uniqueness(self):
538538
self.assertEqual(self.Uniqueness(["b", "d"]), [Row(value=1.0)])
539539
self.assertEqual(self.Uniqueness(["a", "a"]), [Row(value=1.0)])
540540

541-
@pytest.mark.skip(reason="@unittest.expectedFailure")
541+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
542542
def test_fail_Uniqueness(self):
543543
self.assertEqual(self.Uniqueness(["a", "a"]), [])
544544

545545
def test_UniqueValueRatio(self):
546546
self.assertEqual(self.UniqueValueRatio(["b", "d"]), [Row(value=1.0)])
547547
self.assertEqual(self.UniqueValueRatio(["b"]), [Row(value=1.0)])
548548

549-
@pytest.mark.skip(reason="@unittest.expectedFailure")
549+
@pytest.mark.xfail(reason="@unittest.expectedFailure")
550550
def test_fail_UniqueValueRatio(self):
551551
self.assertEqual(self.UniqueValueRatio(["a", "a"]), [])
552552

tests/test_anomaly_detection.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ def HoltWinters(self, analyzer_func, test, df_prev, df_curr=None):
302302
# TODO - Failing bcoz of
303303
# can not implement breeze.stats.DescriptiveStats, because it is not an interface
304304
# (breeze.stats.DescriptiveStats is in unnamed module of loader 'app')
305-
@pytest.mark.skip(reason="TODO: breeze.stats.DescriptiveStats is in unnamed module of loader 'app'")
305+
@pytest.mark.xfail(reason="TODO: breeze.stats.DescriptiveStats is in unnamed module of loader 'app'")
306306
def test_BatchNormalStrategy(self):
307307

308308
# Interval is inclusive, so meet the requirements upper value is up to 9
@@ -363,7 +363,7 @@ def test_OnlineNormalStrategy(self):
363363
# TODO - Fix in deequ - Failing bcoz of
364364
# can not implement breeze.stats.DescriptiveStats, because it is not an interface
365365
# (breeze.stats.DescriptiveStats is in unnamed module of loader 'app')
366-
@pytest.mark.skip(reason="TODO: breeze.stats.DescriptiveStats is in unnamed module of loader 'app'")
366+
@pytest.mark.xfail(reason="TODO: breeze.stats.DescriptiveStats is in unnamed module of loader 'app'")
367367
def test_holtWinters(self):
368368

369369
# must have 15 points of data
@@ -482,7 +482,7 @@ def get_anomalyDetector(self, anomaly):
482482
AnomalyDetector._set_jvm(self._jvm, strategy_jvm)
483483
return AnomalyDetector._anomaly_jvm
484484

485-
@unittest.skip("Not implemented yet!")
485+
@pytest.mark.skip("Not implemented yet!")
486486
def test_anomalyDetector(self):
487487
self.get_anomalyDetector(SimpleThresholdStrategy(1.0, 3.0))
488488

0 commit comments

Comments
 (0)