|
60 | 60 | spark.sql(f"USE CATALOG {CATALOG_NAME}") |
61 | 61 | spark.sql(f"USE SCHEMA {SCHEMA_NAME}") |
62 | 62 |
|
63 | | -print(f"✅ Generating data in: {CATALOG_NAME}.{SCHEMA_NAME}") |
64 | | -print(f"📊 Total records to generate: {NUM_PRODUCTS + NUM_DISTRIBUTION_CENTERS + NUM_STORES + NUM_ORDERS + NUM_INVENTORY_RECORDS + NUM_SHIPMENTS:,}") |
| 63 | +print(f"Generating data in: {CATALOG_NAME}.{SCHEMA_NAME}") |
| 64 | +print(f"Total records to generate: {NUM_PRODUCTS + NUM_DISTRIBUTION_CENTERS + NUM_STORES + NUM_ORDERS + NUM_INVENTORY_RECORDS + NUM_SHIPMENTS:,}") |
65 | 65 |
|
66 | 66 | # COMMAND ---------- |
67 | 67 |
|
|
127 | 127 | # Write to table |
128 | 128 | df_products.write.mode("overwrite").saveAsTable("products") |
129 | 129 |
|
130 | | -# print(f"✅ Created products table with {df_products.count():,} records") |
131 | 130 | display(df_products.limit(10)) |
132 | 131 |
|
133 | 132 | # COMMAND ---------- |
|
179 | 178 | df_distribution_centers = distribution_center_spec.build() |
180 | 179 | df_distribution_centers.write.mode("overwrite").saveAsTable("distribution_centers") |
181 | 180 |
|
182 | | -print(f"✅ Created distribution_centers table with {df_distribution_centers.count():,} records") |
| 181 | +print(f"Created distribution_centers table with {df_distribution_centers.count():,} records") |
183 | 182 | display(df_distribution_centers.limit(10)) |
184 | 183 |
|
185 | 184 | # COMMAND ---------- |
|
227 | 226 | df_stores = stores_spec.build() |
228 | 227 | df_stores.write.mode("overwrite").saveAsTable("stores") |
229 | 228 |
|
230 | | -print(f"✅ Created stores table with {df_stores.count():,} records") |
231 | | -print(f"🔗 Each store is linked to a distribution_center via distribution_center_id foreign key") |
| 229 | +print(f"Created stores table with {df_stores.count():,} records") |
| 230 | +print(f"Each store is linked to a distribution_center via distribution_center_id foreign key") |
232 | 231 | display(df_stores.limit(10)) |
233 | 232 |
|
234 | 233 | # COMMAND ---------- |
|
335 | 334 |
|
336 | 335 | df_orders.write.mode("overwrite").saveAsTable("orders") |
337 | 336 |
|
338 | | -print(f"✅ Created orders table with {df_orders.count():,} records") |
339 | | -print(f"📊 Status distribution:") |
| 337 | +print(f"Created orders table with {df_orders.count():,} records") |
| 338 | +print(f"Order Status distribution:") |
340 | 339 | df_orders.groupBy("status").count().orderBy("status").show() |
341 | 340 |
|
342 | 341 | # COMMAND ---------- |
|
434 | 433 |
|
435 | 434 | df_inventory.write.mode("overwrite").saveAsTable("inventory") |
436 | 435 |
|
437 | | -print(f"✅ Created inventory table with {df_inventory.count():,} records") |
438 | | -print(f"📊 Location type distribution:") |
| 436 | +print(f"Created inventory table with {df_inventory.count():,} records") |
| 437 | +print(f"Location type distribution:") |
439 | 438 | df_inventory.groupBy("location_type").count().show() |
440 | | -print(f"⚠️ Stockout risk distribution:") |
| 439 | +print(f"Stockout risk distribution:") |
441 | 440 | df_inventory.groupBy("stockout_risk").count().orderBy("stockout_risk").show() |
442 | 441 |
|
443 | 442 | # COMMAND ---------- |
|
570 | 569 |
|
571 | 570 | df_shipments.write.mode("overwrite").saveAsTable("shipments") |
572 | 571 |
|
573 | | -print(f"✅ Created shipments table with {df_shipments.count():,} records") |
574 | | -print(f"🚚 Transport mode distribution:") |
| 572 | +print(f"Created shipments table with {df_shipments.count():,} records") |
| 573 | +print(f"Transport mode distribution:") |
575 | 574 | df_shipments.groupBy("transport_mode").count().orderBy(F.desc("count")).show() |
576 | | -print(f"📦 Shipment status distribution:") |
| 575 | +print(f"Shipment status distribution:") |
577 | 576 | df_shipments.groupBy("status").count().orderBy(F.desc("count")).show() |
578 | 577 | display(df_shipments) |
579 | 578 |
|
|
584 | 583 | # MAGIC |
585 | 584 | # MAGIC This dataset enables the following analytics use cases: |
586 | 585 | # MAGIC |
587 | | -# MAGIC ### 📦 Inventory Optimization |
588 | | -# MAGIC - Multi-echelon inventory visibility across distribution_centers and stores |
| 586 | +# MAGIC ### Inventory Optimization |
589 | 587 | # MAGIC - Stockout risk identification and prediction |
590 | 588 | # MAGIC - Days of supply analysis by product/location |
591 | 589 | # MAGIC - Slow-moving inventory identification |
592 | 590 | # MAGIC |
593 | | -# MAGIC ### 🚚 Logistics & Transportation |
| 591 | +# MAGIC ### Logistics & Transportation |
594 | 592 | # MAGIC - Carrier performance scorecards (OTD%, cost, speed) |
595 | 593 | # MAGIC - Route optimization opportunities |
596 | 594 | # MAGIC - Transport mode analysis (cost vs speed tradeoffs) |
597 | | -# MAGIC - Delay root cause analysis |
598 | 595 | # MAGIC |
599 | | -# MAGIC ### 🏭 Order Planning |
| 596 | +# MAGIC ### Order Planning |
600 | 597 | # MAGIC - Order schedule optimization |
601 | 598 | # MAGIC - Line efficiency tracking |
602 | 599 | # MAGIC - Capacity planning and utilization |
603 | | -# MAGIC - Order-to-inventory flow analysis |
604 | 600 | # MAGIC |
605 | | -# MAGIC ### 📊 Supply Chain Analytics |
| 601 | +# MAGIC ### Supply Chain Analytics |
606 | 602 | # MAGIC - End-to-end supply chain visibility |
607 | 603 | # MAGIC - Network optimization (distribution_center placement, capacity) |
608 | 604 | # MAGIC - Working capital optimization |
609 | | -# MAGIC - Cost-to-serve analysis by region/channel |
610 | 605 | # MAGIC |
611 | | -# MAGIC ### 🤖 AI/ML Use Cases |
| 606 | +# MAGIC ### AI/ML Use Cases |
612 | 607 | # MAGIC - Demand forecasting |
613 | 608 | # MAGIC - Predictive maintenance (production efficiency) |
614 | 609 | # MAGIC - Shipment delay prediction |
615 | | -# MAGIC - Inventory replenishment optimization |
616 | 610 |
|
617 | 611 | # COMMAND ---------- |
618 | 612 |
|
|
696 | 690 |
|
697 | 691 | # COMMAND ---------- |
698 | 692 |
|
699 | | -# MAGIC %md |
700 | | -# MAGIC ## 🎉 Congratulations! |
701 | | -# MAGIC |
702 | | -# MAGIC You've successfully generated a complete CPG supply chain dataset using dbldatagen! |
703 | | -# MAGIC |
704 | | -# MAGIC ### What You've Learned: |
705 | | -# MAGIC ✅ How to install and import dbldatagen |
706 | | -# MAGIC ✅ Basic column generation with different data types |
707 | | -# MAGIC ✅ Creating foreign key relationships |
708 | | -# MAGIC ✅ Weighted categorical distributions |
709 | | -# MAGIC ✅ Date/timestamp generation |
710 | | -# MAGIC ✅ Post-processing with PySpark |
711 | | -# MAGIC ✅ Safe handling of division and NULL values |
712 | | -# MAGIC |
713 | | -# MAGIC ### Your Dataset Includes: |
714 | | -# MAGIC - 500 Products across 7 categories |
715 | | -# MAGIC - 25 Distribution Centers |
716 | | -# MAGIC - 1,000 Retail Stores |
717 | | -# MAGIC - 10,000 Orders |
718 | | -# MAGIC - 50,000 Inventory Records |
719 | | -# MAGIC - 30,000 Shipments |
720 | | -# MAGIC |
721 | | -# MAGIC **Total: 91,525 records ready for analytics!** |
722 | | -# MAGIC |
723 | | -# MAGIC Now go build some amazing dashboards! 📊✨ |
724 | | - |
725 | | -# COMMAND ---------- |
726 | | - |
0 commit comments