
Commit 98ef5e0

Merge branch 'main' into main
2 parents 1ac02fa + 8707ce0 commit 98ef5e0

29 files changed: +1157 -261 lines changed

README.md

Lines changed: 8 additions & 0 deletions
@@ -83,13 +83,18 @@ Top must-join communities for ML:
 - [Hex](https://hex.ai/)
 - [Apache Superset](https://superset.apache.org/)
 - [Evidence](https://evidence.dev)
+- [Redash](https://redash.io/)
+- [Lightdash](https://lightdash.com/)
 - Data Integration
 - [Cube](https://cube.dev)
 - [Fivetran](https://www.fivetran.com)
 - [Airbyte](https://airbyte.io)
 - [dlt](https://dlthub.com/)
 - [Sling](https://slingdata.io/)
 - [Meltano](https://meltano.com/)
+- Semantic Layers
+- [Cube](https://cube.dev)
+- [dbt Semantic Layer](https://www.getdbt.com/product/semantic-layer)
 - Modern OLAP
 - [Apache Druid](https://druid.apache.org/)
 - [ClickHouse](https://clickhouse.com/)

@@ -190,6 +195,9 @@ Here's the mostly comprehensive list of data engineering creators:
 | Arnaud Milleker | | [Arnaud Milleker](https://www.linkedin.com/in/arnaudmilleker/) (7k+) | | | |
 | Soumil Shah | [Soumil Shah] (https://www.youtube.com/@SoumilShah) (50k) | [Soumil Shah](https://www.linkedin.com/in/shah-soumil/) (8k+) | | | |
 | Ananth Packkildurai | | [Ananth Packkildurai](https://www.linkedin.com/in/ananthdurai/) (18k+) | | | |
+| Dan Kornas | | | [dankornas](https://www.twitter.com/dankornas) (66k+) | |
+| Nitin | https://www.linkedin.com/in/tomernitin29/ |
+| Manojkumar Vadivel | | [Manojkumar Vadivel](https://www.linkedin.com/in/manojvsj/) (12k+) |

 ### Great Podcasts

books.md

Lines changed: 2 additions & 1 deletion
@@ -29,4 +29,5 @@
 - [Pandas Cookbook, Third Edition](https://www.amazon.com/Pandas-Cookbook-Practical-scientific-exploratory/dp/1836205872)
 - [Data Pipelines Pocket Reference](https://www.oreilly.com/library/view/data-pipelines-pocket/9781492087823/)
 - [Stream Processing with Apache Flink](https://www.oreilly.com/library/view/stream-processing-with/9781491974285/)
-- [Apache Iceberg The Definitive Guide](https://www.oreilly.com/library/view/apache-iceberg-the/9781098148614/)
+- [Apache Iceberg The Definitive Guide](https://www.oreilly.com/library/view/apache-iceberg-the/9781098148614/)
+- [Python for Data Analysis, 3E](https://wesmckinney.com/book/)

bootcamp/materials/1-dimensional-data-modeling/README.md

Lines changed: 4 additions & 2 deletions
@@ -45,10 +45,12 @@ There are two methods to get Postgres running locally.
 - For Mac: Follow this **[tutorial](https://daily-dev-tips.com/posts/installing-postgresql-on-a-mac-with-homebrew/)** (Homebrew is really nice for installing on Mac)
 - For Windows: Follow this **[tutorial](https://www.sqlshack.com/how-to-install-postgresql-on-windows/)**
 2. Run this command after replacing **`<computer-username>`** with your computer's username:
-
+
 ```bash
-pg_restore -U <computer-username> postgres data.dump
+pg_restore -U <computer-username> -d postgres data.dump
 ```
+
+If you have any issue, the syntax is `pg_restore -U [username] -d [database_name] -h [host] -p [port] [backup_file]`
 
 3. Set up DataGrip, DBeaver, or your VS Code extension to point at your locally running Postgres instance.
 4. Have fun querying!
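
A fuller invocation of the syntax from the added note might look like the sketch below. The host, port, and database name here are just the stock Postgres defaults, not values confirmed by this repo's setup; adjust them to wherever your instance is actually listening.

```bash
# Illustrative only: restore data.dump into the default "postgres" database
# on a local server listening on the default port.
pg_restore -U <computer-username> -d postgres -h localhost -p 5432 data.dump
```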

bootcamp/materials/1-dimensional-data-modeling/example.env

Lines changed: 1 addition & 1 deletion
@@ -11,4 +11,4 @@ DOCKER_IMAGE=my-postgres-image
 
 PGADMIN_EMAIL=[email protected]
 PGADMIN_PASSWORD=postgres
-PGADMIN_PORT=5050
+PGADMIN_PORT=5050

bootcamp/materials/3-spark-fundamentals/notebooks/Caching.ipynb

Lines changed: 49 additions & 46 deletions
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 3,
 "id": "e9ae4c8b-4599-4fbb-a545-76b6e3bcb84d",
 "metadata": {},
 "outputs": [
@@ -12,52 +12,54 @@
 "text": [
 "== Physical Plan ==\n",
 "AdaptiveSparkPlan isFinalPlan=false\n",
-"+- ObjectHashAggregate(keys=[device_id#937, device_type#940], functions=[collect_list(user_id#907, 0, 0)])\n",
-" +- ObjectHashAggregate(keys=[device_id#937, device_type#940], functions=[partial_collect_list(user_id#907, 0, 0)])\n",
-" +- Project [device_id#937, device_type#940, user_id#907]\n",
-" +- SortMergeJoin [device_id#937], [device_id#908], Inner\n",
-" :- Sort [device_id#937 ASC NULLS FIRST], false, 0\n",
-" : +- Exchange hashpartitioning(device_id#937, 4), ENSURE_REQUIREMENTS, [plan_id=1320]\n",
-" : +- Filter isnotnull(device_id#937)\n",
-" : +- FileScan csv [device_id#937,device_type#940] Batched: false, DataFilters: [isnotnull(device_id#937)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/devices.csv], PartitionFilters: [], PushedFilters: [IsNotNull(device_id)], ReadSchema: struct<device_id:int,device_type:string>\n",
-" +- Sort [device_id#908 ASC NULLS FIRST], false, 0\n",
-" +- Exchange hashpartitioning(device_id#908, 4), ENSURE_REQUIREMENTS, [plan_id=1321]\n",
-" +- Filter isnotnull(device_id#908)\n",
-" +- InMemoryTableScan [user_id#907, device_id#908], [isnotnull(device_id#908)]\n",
-" +- InMemoryRelation [user_id#907, device_id#908, event_counts#945L, host_array#946], StorageLevel(disk, memory, deserialized, 1 replicas)\n",
-" +- ObjectHashAggregate(keys=[user_id#198, device_id#199], functions=[count(1), collect_list(distinct host#201, 0, 0)])\n",
-" +- Exchange hashpartitioning(user_id#198, device_id#199, 4), ENSURE_REQUIREMENTS, [plan_id=1338]\n",
-" +- ObjectHashAggregate(keys=[user_id#198, device_id#199], functions=[merge_count(1), partial_collect_list(distinct host#201, 0, 0)])\n",
-" +- *(2) HashAggregate(keys=[user_id#198, device_id#199, host#201], functions=[merge_count(1)])\n",
-" +- Exchange hashpartitioning(user_id#198, device_id#199, host#201, 4), ENSURE_REQUIREMENTS, [plan_id=1333]\n",
-" +- *(1) HashAggregate(keys=[user_id#198, device_id#199, host#201], functions=[partial_count(1)])\n",
-" +- *(1) Filter isnotnull(user_id#198)\n",
-" +- FileScan csv [user_id#198,device_id#199,host#201] Batched: false, DataFilters: [isnotnull(user_id#198)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int,host:string>\n",
+"+- ObjectHashAggregate(keys=[device_id#598, device_type#601], functions=[collect_list(user_id#568, 0, 0)])\n",
+" +- ObjectHashAggregate(keys=[device_id#598, device_type#601], functions=[partial_collect_list(user_id#568, 0, 0)])\n",
+" +- Project [device_id#598, device_type#601, user_id#568]\n",
+" +- SortMergeJoin [device_id#598], [device_id#569], Inner\n",
+" :- Sort [device_id#598 ASC NULLS FIRST], false, 0\n",
+" : +- Exchange hashpartitioning(device_id#598, 4), ENSURE_REQUIREMENTS, [plan_id=735]\n",
+" : +- Filter isnotnull(device_id#598)\n",
+" : +- FileScan csv [device_id#598,device_type#601] Batched: false, DataFilters: [isnotnull(device_id#598)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/devices.csv], PartitionFilters: [], PushedFilters: [IsNotNull(device_id)], ReadSchema: struct<device_id:int,device_type:string>\n",
+" +- Sort [device_id#569 ASC NULLS FIRST], false, 0\n",
+" +- Exchange hashpartitioning(device_id#569, 4), ENSURE_REQUIREMENTS, [plan_id=736]\n",
+" +- Filter isnotnull(device_id#569)\n",
+" +- InMemoryTableScan [user_id#568, device_id#569], [isnotnull(device_id#569)]\n",
+" +- InMemoryRelation [user_id#568, device_id#569, event_counts#606L, host_array#607], StorageLevel(disk, memory, deserialized, 1 replicas)\n",
+" +- AdaptiveSparkPlan isFinalPlan=false\n",
+" +- ObjectHashAggregate(keys=[user_id#17, device_id#18], functions=[count(1), collect_list(distinct host#20, 0, 0)])\n",
+" +- Exchange hashpartitioning(user_id#17, device_id#18, 4), ENSURE_REQUIREMENTS, [plan_id=752]\n",
+" +- ObjectHashAggregate(keys=[user_id#17, device_id#18], functions=[merge_count(1), partial_collect_list(distinct host#20, 0, 0)])\n",
+" +- HashAggregate(keys=[user_id#17, device_id#18, host#20], functions=[merge_count(1)])\n",
+" +- Exchange hashpartitioning(user_id#17, device_id#18, host#20, 4), ENSURE_REQUIREMENTS, [plan_id=748]\n",
+" +- HashAggregate(keys=[user_id#17, device_id#18, host#20], functions=[partial_count(1)])\n",
+" +- Filter isnotnull(user_id#17)\n",
+" +- FileScan csv [user_id#17,device_id#18,host#20] Batched: false, DataFilters: [isnotnull(user_id#17)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int,host:string>\n",
 "\n",
 "\n",
 "== Physical Plan ==\n",
 "AdaptiveSparkPlan isFinalPlan=false\n",
-"+- ObjectHashAggregate(keys=[user_id#907], functions=[max(event_counts#945L), collect_list(device_id#908, 0, 0)])\n",
-" +- ObjectHashAggregate(keys=[user_id#907], functions=[partial_max(event_counts#945L), partial_collect_list(device_id#908, 0, 0)])\n",
-" +- Project [user_id#907, device_id#908, event_counts#945L]\n",
-" +- SortMergeJoin [user_id#907], [user_id#953], Inner\n",
-" :- Sort [user_id#907 ASC NULLS FIRST], false, 0\n",
-" : +- Exchange hashpartitioning(user_id#907, 4), ENSURE_REQUIREMENTS, [plan_id=1374]\n",
-" : +- Filter isnotnull(user_id#907)\n",
-" : +- FileScan csv [user_id#907,device_id#908] Batched: false, DataFilters: [isnotnull(user_id#907)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int>\n",
-" +- Sort [user_id#953 ASC NULLS FIRST], false, 0\n",
-" +- Exchange hashpartitioning(user_id#953, 4), ENSURE_REQUIREMENTS, [plan_id=1375]\n",
-" +- Filter isnotnull(user_id#953)\n",
-" +- InMemoryTableScan [user_id#953, event_counts#945L], [isnotnull(user_id#953)]\n",
-" +- InMemoryRelation [user_id#953, device_id#954, event_counts#945L, host_array#946], StorageLevel(disk, memory, deserialized, 1 replicas)\n",
-" +- ObjectHashAggregate(keys=[user_id#198, device_id#199], functions=[count(1), collect_list(distinct host#201, 0, 0)])\n",
-" +- Exchange hashpartitioning(user_id#198, device_id#199, 4), ENSURE_REQUIREMENTS, [plan_id=1392]\n",
-" +- ObjectHashAggregate(keys=[user_id#198, device_id#199], functions=[merge_count(1), partial_collect_list(distinct host#201, 0, 0)])\n",
-" +- *(2) HashAggregate(keys=[user_id#198, device_id#199, host#201], functions=[merge_count(1)])\n",
-" +- Exchange hashpartitioning(user_id#198, device_id#199, host#201, 4), ENSURE_REQUIREMENTS, [plan_id=1387]\n",
-" +- *(1) HashAggregate(keys=[user_id#198, device_id#199, host#201], functions=[partial_count(1)])\n",
-" +- *(1) Filter isnotnull(user_id#198)\n",
-" +- FileScan csv [user_id#198,device_id#199,host#201] Batched: false, DataFilters: [isnotnull(user_id#198)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int,host:string>\n",
+"+- ObjectHashAggregate(keys=[user_id#568], functions=[max(event_counts#606L), collect_list(device_id#569, 0, 0)])\n",
+" +- ObjectHashAggregate(keys=[user_id#568], functions=[partial_max(event_counts#606L), partial_collect_list(device_id#569, 0, 0)])\n",
+" +- Project [user_id#568, device_id#569, event_counts#606L]\n",
+" +- SortMergeJoin [user_id#568], [user_id#614], Inner\n",
+" :- Sort [user_id#568 ASC NULLS FIRST], false, 0\n",
+" : +- Exchange hashpartitioning(user_id#568, 4), ENSURE_REQUIREMENTS, [plan_id=788]\n",
+" : +- Filter isnotnull(user_id#568)\n",
+" : +- FileScan csv [user_id#568,device_id#569] Batched: false, DataFilters: [isnotnull(user_id#568)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int>\n",
+" +- Sort [user_id#614 ASC NULLS FIRST], false, 0\n",
+" +- Exchange hashpartitioning(user_id#614, 4), ENSURE_REQUIREMENTS, [plan_id=789]\n",
+" +- Filter isnotnull(user_id#614)\n",
+" +- InMemoryTableScan [user_id#614, event_counts#606L], [isnotnull(user_id#614)]\n",
+" +- InMemoryRelation [user_id#614, device_id#615, event_counts#606L, host_array#607], StorageLevel(disk, memory, deserialized, 1 replicas)\n",
+" +- AdaptiveSparkPlan isFinalPlan=false\n",
+" +- ObjectHashAggregate(keys=[user_id#17, device_id#18], functions=[count(1), collect_list(distinct host#20, 0, 0)])\n",
+" +- Exchange hashpartitioning(user_id#17, device_id#18, 4), ENSURE_REQUIREMENTS, [plan_id=805]\n",
+" +- ObjectHashAggregate(keys=[user_id#17, device_id#18], functions=[merge_count(1), partial_collect_list(distinct host#20, 0, 0)])\n",
+" +- HashAggregate(keys=[user_id#17, device_id#18, host#20], functions=[merge_count(1)])\n",
+" +- Exchange hashpartitioning(user_id#17, device_id#18, host#20, 4), ENSURE_REQUIREMENTS, [plan_id=801]\n",
+" +- HashAggregate(keys=[user_id#17, device_id#18, host#20], functions=[partial_count(1)])\n",
+" +- Filter isnotnull(user_id#17)\n",
+" +- FileScan csv [user_id#17,device_id#18,host#20] Batched: false, DataFilters: [isnotnull(user_id#17)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int,host:string>\n",
 "\n",
 "\n"
 ]
@@ -73,10 +75,10 @@
 "eventsAggregated: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: int, device_id: int ... 2 more fields]\n",
 "usersAndDevices: org.apache.spark.sql.DataFrame = [user_id: int, user_id: int ... 2 more fields]\n",
 "devicesOnEvents: org.apache.spark.sql.DataFrame = [device_id: int, device_type: string ... 3 more fields]\n",
-"res4: Array[org.apache.spark.sql.Row] = Array([-2147470439,-2147470439,3,WrappedArray(378988111, 378988111, 378988111)])\n"
+"res1: Array[org.apache.spark.sql.Row] = Array([-2147470439,-2147470439,3,WrappedArray(378988111, 378988111, 378988111)])\n"
 ]
 },
-"execution_count": 5,
+"execution_count": 3,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -107,6 +109,7 @@
 "//Caching here should be < 5 GBs or used for broadcast join\n",
 "//You need to tune executor memory otherwise it'll spill to disk and be slow\n",
 "//Don't really try using any of the other StorageLevel besides MEMORY_ONLY\n",
+"\n",
 "val eventsAggregated = spark.sql(f\"\"\"\n",
 " SELECT user_id, \n",
 " device_id, \n",
@@ -207,4 +210,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
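
The churn in this notebook is re-run output: the cached aggregate still shows up as an InMemoryRelation / InMemoryTableScan pair, and only the expression IDs, plan_ids, and execution counts shifted between runs. For readers skimming the diff, here is a minimal Scala sketch of the caching pattern the notebook exercises; the names and the simplified query are illustrative, not the notebook's exact code, and it assumes the events and devices CSVs are already registered as temp views, as the notebook's SQL implies.

```scala
import org.apache.spark.storage.StorageLevel

// Cache the aggregated events so later joins reuse the result instead of
// recomputing the aggregation from the CSV scan each time.
// MEMORY_ONLY is the level the notebook's comments recommend, on the
// assumption that the cached result stays well under a few GB.
val eventsAggregated = spark.sql(
  """
    SELECT user_id, device_id, COUNT(1) AS event_counts
    FROM events
    GROUP BY user_id, device_id
  """
).persist(StorageLevel.MEMORY_ONLY)

// Any query built on the cached Dataset shows InMemoryRelation /
// InMemoryTableScan in its physical plan, as in the output above.
eventsAggregated
  .join(spark.table("devices"), Seq("device_id"))
  .explain()
```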
