Skip to content

Commit f237476

Browse files
authored
Merge branch 'master' into CGP_datagen_example
2 parents eb94d8f + e664451 commit f237476

File tree

10 files changed

+1653
-658
lines changed

10 files changed

+1653
-658
lines changed

dbldatagen/data_analyzer.py

Lines changed: 329 additions & 272 deletions
Large diffs are not rendered by default.

dbldatagen/data_generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1913,7 +1913,7 @@ def scriptMerge(
19131913
result = "\n".join(results)
19141914

19151915
if asHtml:
1916-
result = HtmlUtils.formatCodeAsHtml(results)
1916+
result = HtmlUtils.formatCodeAsHtml(result)
19171917

19181918
return result
19191919

dbldatagen/datasets/dataset_provider.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
This file defines the DatasetProvider class
2121
"""
2222

23+
2324
class DatasetProvider(ABC):
2425
"""
2526
The DatasetProvider class acts as a base class for all dataset providers
@@ -206,7 +207,7 @@ def getTableGenerator(self, sparkSession: SparkSession, *, tableName: str|None=N
206207
raise NotImplementedError("Base data provider does not provide any table generation specifications!")
207208

208209
@abstractmethod
209-
def getAssociatedDataset(self, sparkSession: SparkSession, *, tableName: str|None=None, rows: int=-1, partitions: int=-1,
210+
def getAssociatedDataset(self, sparkSession: SparkSession, *, tableName: str | None=None, rows: int=-1, partitions: int=-1,
210211
**options: dict[str, Any]) -> DataGenerator:
211212
"""
212213
Gets associated datasets that are used in conjunction with the provider datasets.
@@ -240,7 +241,7 @@ def allowed_options(options: list[str]|None =None) -> Callable[[Callable], Calla
240241

241242
def decorator(func: Callable) -> Callable:
242243
@functools.wraps(func)
243-
def wrapper(*args, **kwargs) -> Callable: # noqa: ANN002
244+
def wrapper(*args, **kwargs) -> Callable:
244245
bad_options = [keyword_arg for keyword_arg in kwargs
245246
if keyword_arg not in DEFAULT_OPTIONS and keyword_arg not in options]
246247

dbldatagen/datasets_object.py

Lines changed: 229 additions & 138 deletions
Large diffs are not rendered by default.

dbldatagen/function_builder.py

Lines changed: 43 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,46 @@
11
# See the License for the specific language governing permissions and
22
# limitations under the License.
33
#
4+
5+
"""
6+
This file defines the `ColumnGeneratorBuilder` class and utility functions
7+
"""
8+
49
import itertools
10+
from typing import Any
511

6-
from pyspark.sql.types import StringType, DateType, TimestampType
12+
from pyspark.sql.types import DataType, DateType, StringType, TimestampType
713

814

915
class ColumnGeneratorBuilder:
10-
""" Helper class to build functional column generators of specific forms"""
16+
"""
17+
Helper class to build functional column generators of specific forms
18+
"""
1119

1220
@classmethod
13-
def _mkList(cls, x):
21+
def _mkList(cls, x: object) -> list:
22+
"""
23+
Makes a list of the supplied object instance if it is not already a list.
24+
25+
:param x: Input object to process
26+
:returns: List containing the supplied object if it is not already a list; otherwise returns the object
1427
"""
15-
Makes a list of the supplied object instance if it is not already a list
16-
:param x: object to process
17-
:returns: Returns list of supplied object if it is not already a list, otherwise simply returns the object"""
1828
return [x] if type(x) is not list else x
1929

2030
@classmethod
21-
def _lastElement(cls, x):
22-
""" Gets the last element, if the object is a list otherwise returns the object itself"""
23-
return x[-1] if type(x) is list else x
31+
def _lastElement(cls, x: object) -> object:
32+
"""
33+
Gets the last element from the supplied object if it is a list.
34+
35+
:param x: Input object
36+
:returns: Last element of the input object if it is a list; otherwise returns the object
37+
"""
38+
return x[-1] if isinstance(x, list) else x
2439

2540
@classmethod
26-
def _mkCdfProbabilities(cls, weights):
27-
""" make cumulative distribution function probabilities for each value in values list
41+
def _mkCdfProbabilities(cls, weights: list[float]) -> list[float]:
42+
"""
43+
Makes cumulative distribution function probabilities for each value in values list.
2844
2945
a cumulative distribution function for discrete values can use
3046
a table of cumulative probabilities to evaluate different expressions
@@ -46,6 +62,9 @@ def _mkCdfProbabilities(cls, weights):
4662
while datasets of size 10,000 x `number of values` gives a repeated
4763
distribution within 5% of expected distribution.
4864
65+
:param weights: List of weights to compute CDF probabilities for
66+
:returns: List of CDF probabilities
67+
4968
Example code to be generated (pseudo code)::
5069
5170
# given values value1 .. valueN, prob1 to probN
@@ -61,13 +80,12 @@ def _mkCdfProbabilities(cls, weights):
6180
6281
"""
6382
total_weights = sum(weights)
64-
return list(map(lambda x: x / total_weights, itertools.accumulate(weights)))
83+
return [x / total_weights for x in itertools.accumulate(weights)]
6584

6685
@classmethod
67-
def mkExprChoicesFn(cls, values, weights, seed_column, datatype):
68-
""" Create SQL expression to compute the weighted values expression
69-
70-
build an expression of the form::
86+
def mkExprChoicesFn(cls, values: list[Any], weights: list[float], seed_column: str, datatype: DataType) -> str:
87+
"""
88+
Creates a SQL expression to compute a weighted values expression. Builds an expression of the form::
7189
7290
case
7391
when rnd_column <= weight1 then value1
@@ -77,22 +95,22 @@ def mkExprChoicesFn(cls, values, weights, seed_column, datatype):
7795
else valueN
7896
end
7997
80-
based on computed probability distribution for values.
81-
82-
In Python 3.6 onwards, we could use the choices function but this python version is not
83-
guaranteed on all Databricks distributions
98+
The output expression is based on the computed probability distribution for the specified values.
8499
85-
:param values: list of values
86-
:param weights: list of weights
87-
:param seed_column: base column for expression
88-
:param datatype: data type of function return value
100+
In Python 3.6 onwards, we could use the choices function but this python version is not guaranteed on all
101+
Databricks distributions.
89102
103+
:param values: List of values
104+
:param weights: List of weights
105+
:param seed_column: Base column name for expression
106+
:param datatype: Spark `DataType` of the output expression
107+
:returns: SQL expression representing the weighted values
90108
"""
91109
cdf_probs = cls._mkCdfProbabilities(weights)
92110

93111
output = [" CASE "]
94112

95-
conditions = zip(values, cdf_probs)
113+
conditions = zip(values, cdf_probs, strict=False)
96114

97115
for v, cdf in conditions:
98116
# TODO(alex): single quotes need to be escaped

dbldatagen/html_utils.py

Lines changed: 29 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,40 +6,40 @@
66
This file defines the `HtmlUtils` classes and utility functions
77
"""
88

9-
from .utils import system_time_millis
9+
from dbldatagen.utils import system_time_millis
1010

1111

1212
class HtmlUtils:
13-
""" Utility class for formatting code as HTML and other notebook related formatting
14-
13+
"""
14+
Utility class for formatting code as HTML and other notebook-related formatting.
1515
"""
1616

17-
def __init__(self):
17+
def __init__(self) -> None:
1818
pass
1919

20-
@classmethod
21-
def formatCodeAsHtml(cls, codeText):
22-
""" Formats supplied code as Html suitable for use with notebook ``displayHTML``
23-
24-
:param codeText: Code to be wrapped in html section
25-
:return: Html string
20+
@staticmethod
21+
def formatCodeAsHtml(codeText: str) -> str:
22+
"""
23+
Formats the input code as HTML suitable for use with a notebook's ``displayHTML`` command.
2624
27-
This will wrap the code with a html section using html ``pre`` and ``code`` tags.
25+
This method wraps the input code with an html section using ``pre`` and ``code`` tags. It adds a *Copy Text to
26+
Clipboard* button which allows users to easily copy the code to the clipboard.
2827
29-
It adds a copy text to clipboard button to enable users to easily copy the code to the clipboard.
28+
Code is not reformatted. Supplied code should be preformatted into lines.
3029
31-
It does not reformat code so supplied code should be preformatted into lines.
30+
:param codeText: Input code as a string
31+
:return: Formatted code as an HTML string
3232
3333
.. note::
3434
As the notebook environment uses IFrames in rendering html within ``displayHTML``, it cannot use
3535
the newer ``navigator`` based functionality as this is blocked for cross domain IFrames by default.
3636
3737
"""
38-
ts = system_time_millis()
38+
current_ts = system_time_millis()
3939

40-
formattedCode = f"""
40+
return f"""
4141
<h3>Generated Code</h3>
42-
<div style="outline: 1px dashed blue;"><p ><pre><code id="generated_code_{ts}">
42+
<div style="outline: 1px dashed blue;"><p ><pre><code id="generated_code_{current_ts}">
4343
{codeText}
4444
</code></pre></p></br>
4545
</div>
@@ -48,7 +48,7 @@ def formatCodeAsHtml(cls, codeText):
4848
function dbldatagen_copy_code_to_clipboard() {{
4949
try {{
5050
var r = document.createRange();
51-
r.selectNode(document.getElementById("generated_code_{ts}"));
51+
r.selectNode(document.getElementById("generated_code_{current_ts}"));
5252
window.getSelection().removeAllRanges();
5353
window.getSelection().addRange(r);
5454
document.execCommand('copy');
@@ -61,23 +61,20 @@ def formatCodeAsHtml(cls, codeText):
6161
</script>
6262
"""
6363

64-
return formattedCode
65-
66-
@classmethod
67-
def formatTextAsHtml(cls, textContent, title="Output"):
68-
""" Formats supplied text as Html suitable for use with notebook ``displayHTML``
69-
70-
:param textContent: Text to be wrapped in html section
71-
:param title: Title text to be used
72-
:return: Html string
73-
74-
This will wrap the text content with with Html formatting
64+
@staticmethod
65+
def formatTextAsHtml(textContent: str, title: str = "Output") -> str:
66+
"""
67+
Formats the input text as HTML suitable for use with a notebook's ``displayHTML`` command. This wraps the text
68+
content with HTML formatting blocks and adds a section title.
7569
70+
:param textContent: Input text to be wrapped in an HTML section
71+
:param title: Section title (default `"Output"`)
72+
:return: Text section as an HTML string
7673
"""
77-
ts = system_time_millis()
78-
formattedContent = f"""
74+
current_ts = system_time_millis()
75+
return f"""
7976
<h3>{title}</h3>
80-
<div style="outline: 1px dashed blue;"><p ><pre id="generated_content_{ts}">
77+
<div style="outline: 1px dashed blue;"><p ><pre id="generated_content_{current_ts}">
8178
{textContent}
8279
</pre></p></br>
8380
</div>
@@ -86,7 +83,7 @@ def formatTextAsHtml(cls, textContent, title="Output"):
8683
function dbldatagen_copy_to_clipboard() {{
8784
try {{
8885
var r = document.createRange();
89-
r.selectNode(document.getElementById("generated_content_{ts}"));
86+
r.selectNode(document.getElementById("generated_content_{current_ts}"));
9087
window.getSelection().removeAllRanges();
9188
window.getSelection().addRange(r);
9289
document.execCommand('copy');
@@ -98,5 +95,3 @@ def formatTextAsHtml(cls, textContent, title="Output"):
9895
}}
9996
</script>
10097
"""
101-
102-
return formattedContent

0 commit comments

Comments
 (0)