Commit 10a5519

code cleanup (#73)
* code cleanup
* fixed unused local vars
1 parent 15e2f67 commit 10a5519

8 files changed: +23 -33 lines


dbldatagen/data_generator.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField

 from .column_generation_spec import ColumnGenerationSpec
-from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME
+from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME
 from .spark_singleton import SparkSingleton
 from .utils import ensure, topologicalSort, DataGenError, deprecated

dbldatagen/daterange.py

Lines changed: 2 additions & 5 deletions
@@ -1,11 +1,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from datetime import date, datetime, timedelta, timezone
 import math
-
-from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
-    StructType, StructField, TimestampType, DataType, DateType, ByteType
+from datetime import datetime, timedelta

 from .datarange import DataRange
 from .utils import parse_time_interval

@@ -104,7 +101,7 @@ def computeDateRange(cls, begin, end, interval, unique_values):
         assert type(unique_values) is int, "unique_values must be integer"
         assert unique_values >= 1, "unique_values must be positive integer"

-        effective_begin = effective_end - effective_interval * (unique_values - 1 )
+        effective_begin = effective_end - effective_interval * (unique_values - 1)

         result = DateRange(effective_begin, effective_end, effective_interval)
         return result
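
Reviewer note: the second hunk only drops a stray space, but for readers skimming the diff, here is a self-contained sketch of the arithmetic on that line. It assumes effective_end is a datetime and effective_interval a timedelta, as in the surrounding method; the values are illustrative.

    from datetime import datetime, timedelta

    # Back-compute a begin date so that exactly `unique_values` points fit
    # between begin and end at the given interval (mirrors the line above).
    effective_end = datetime(2018, 10, 1)
    effective_interval = timedelta(days=7)
    unique_values = 4

    effective_begin = effective_end - effective_interval * (unique_values - 1)
    # effective_begin == datetime(2018, 9, 10): four weekly values ending 2018-10-01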

dbldatagen/utils.py

Lines changed: 4 additions & 4 deletions
@@ -130,7 +130,6 @@ def topologicalSort(sources, initial_columns=None, flatten=True):
             if deps:
                 next_pending.append((name, set(deps)))
             elif name in provided:
-                pass
                 value_emitted |= True
             else:
                 gen.append(name)

@@ -151,6 +150,7 @@

 PATTERN_NAME_EQUALS_VALUE = re.compile(r"(\w+)\s*\=\s*([0-9]+)")
 PATTERN_VALUE_SPACE_NAME = re.compile(r"([0-9]+)\s+(\w+)")
+_WEEKS_PER_YEAR = 52


 def parse_time_interval(spec):

@@ -202,14 +202,14 @@ def parse_time_interval(spec):
     elif time_type in ["milliseconds", "millisecond"]:
         milliseconds = time_value

-    td = delta = timedelta(
+    delta = timedelta(
         days=days,
         seconds=seconds,
         microseconds=microseconds,
         milliseconds=milliseconds,
         minutes=minutes,
         hours=hours,
-        weeks=weeks
+        weeks=weeks + (years * _WEEKS_PER_YEAR)
     )

-    return td
+    return delta
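
Reviewer note: besides renaming td to delta, the last hunk changes behavior: a parsed years value now contributes to the result by being folded into weeks, where previously it appears not to have affected the returned timedelta at all (timedelta has no years argument). A minimal sketch of the new folding, using the 52-weeks-per-year approximation from the _WEEKS_PER_YEAR constant above; interval_from_parts is a hypothetical helper for illustration, not library API:

    from datetime import timedelta

    _WEEKS_PER_YEAR = 52  # approximation, as defined in the hunk above

    def interval_from_parts(years=0, weeks=0, days=0, hours=0):
        # timedelta has no `years` argument, so years are folded into weeks
        # before the delta is built, mirroring the fixed construction above.
        return timedelta(days=days, hours=hours,
                         weeks=weeks + (years * _WEEKS_PER_YEAR))

    print(interval_from_parts(years=1))          # 364 days, 0:00:00
    print(interval_from_parts(weeks=2, days=3))  # 17 days, 0:00:00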

examples/example1.py

Lines changed: 5 additions & 5 deletions
@@ -1,9 +1,10 @@
 from datetime import timedelta, datetime
-import math
-from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType
-import dbldatagen as dg
+
 from pyspark.sql import SparkSession
-from pyspark.sql.functions import count, when, isnan, isnull, col, lit, countDistinct
+from pyspark.sql.functions import count, when, isnan, col, lit, countDistinct
+from pyspark.sql.types import StructType, StructField, IntegerType, StringType
+
+import dbldatagen as dg

 interval = timedelta(days=1, hours=1)
 start = datetime(2017, 10, 1, 0, 0, 0)

@@ -55,7 +56,6 @@
 print("Summary;", analyzer.summarize())


-
 def extended_summary(df):
     colnames = [c for c in df.columns]
     colnames2 = ["summary"]

examples/example4.py

Lines changed: 4 additions & 5 deletions
@@ -1,10 +1,9 @@
 from datetime import timedelta, datetime
-import math
-from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType
-# from dbldatagen.data_generator import DataGenerator,ensure
-import dbldatagen as dg
-from pyspark.conf import SparkConf
+
 from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType, StructField, IntegerType, StringType
+
+import dbldatagen as dg

 interval = timedelta(days=1, hours=1)
 start = datetime(2017, 10, 1, 0, 0, 0)

examples/example5.py

Lines changed: 2 additions & 4 deletions
@@ -1,11 +1,9 @@
 from datetime import timedelta, datetime
-import math
-from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType

-import dbldatagen as dg
-from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession

+import dbldatagen as dg
+
 interval = timedelta(days=1, hours=1)
 start = datetime(2017, 10, 1, 0, 0, 0)
 end = datetime(2018, 10, 1, 6, 0, 0)

examples/example6.py

Lines changed: 4 additions & 6 deletions
@@ -1,11 +1,9 @@
 from datetime import timedelta, datetime
-import math
-from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType
-from dbldatagen import DateRange
-import dbldatagen as dg
-from pyspark.conf import SparkConf
+
 from pyspark.sql import SparkSession

+import dbldatagen as dg
+
 interval = timedelta(days=1, hours=1)
 start = datetime(2017, 10, 1, 0, 0, 0)
 end = datetime(2018, 10, 1, 6, 0, 0)

@@ -37,7 +35,7 @@
     .withColumn("sector_status_desc", "string", dataRange=range(1, 5), prefix='status', random=True)
     # withColumn adds specification for new column
     .withColumn("rand", "float", expr="floor(rand() * 350) * (86400 + 3600)")
-    .withColumn("last_sync_dt", "timestamp", dataRange=DateRange(start, end, timedelta(days=1, hours=1)),
+    .withColumn("last_sync_dt", "timestamp", dataRange=dg.DateRange(start, end, timedelta(days=1, hours=1)),
                 random=True)
     .withColumnSpec("sector_technology_desc", values=["GSM", "UMTS", "LTE", "UNKNOWN"], random=True)
     .withColumn("test_cell_flg", "int", values=[0, 1], random=True)

tutorial/2-Basics.py

Lines changed: 1 addition & 3 deletions
@@ -104,9 +104,7 @@
 # COMMAND ----------

 from datetime import timedelta, datetime
-import math
-from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType
-# from dbldatagen.data_generator import DataGenerator,ensure
+from pyspark.sql.types import StructType, StructField, IntegerType, StringType
 import dbldatagen as dg

 interval = timedelta(days=1, hours=1)
