"""
Loading and Exploring Data

In real data science projects, you'll work with data stored in files.
This example shows you how to load data from a CSV file and explore it.

What you'll learn:
- How to load data from a CSV file
- How to view basic information about your dataset
- How to display the first/last rows
- How to get summary statistics
- How to check for missing values
- How to count the unique values in a column

Prerequisites: pandas library (install with: pip install pandas)
"""

# Import the pandas library - it's the most popular tool for working with data in Python
# We give it the short name 'pd' so we can type less
import pandas as pd

# Location of the dataset. This is a relative path, so it is resolved against
# the folder you run the script from (the current working directory).
DATA_FILE = '../data/birds.csv'

print("=" * 70)
print("Welcome to Data Loading and Exploration!")
print("=" * 70)
print()

# Step 1: Load data from a CSV file
# CSV stands for "Comma-Separated Values" - a common format for storing data
# We'll use the birds dataset that comes with this repository
print("📂 Loading data from birds.csv...")
print()

# Load the data into a DataFrame (think of it as a smart spreadsheet)
# A DataFrame is pandas' main data structure - it organizes data in rows and columns
try:
    data = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    # Give a hint about the most likely cause, then let the original error
    # propagate so the full traceback is still available.
    print(f"❌ Could not find '{DATA_FILE}'.")
    print("   The path is relative to the folder you run this script from.")
    raise

print("✅ Data loaded successfully!")
print()

# Step 2: Get basic information about the dataset
print("-" * 70)
print("BASIC DATASET INFORMATION")
print("-" * 70)

# How many rows and columns do we have?
# shape is a (rows, columns) tuple, so we can unpack it into two variables
num_rows, num_columns = data.shape
print(f"📊 Dataset size: {num_rows} rows × {num_columns} columns")
print()

# What are the column names?
print("📋 Column names:")
for i, column in enumerate(data.columns, 1):  # start numbering at 1, not 0
    print(f" {i}. {column}")
print()

# Step 3: Look at the first few rows of data
# This gives us a quick preview of what the data looks like
print("-" * 70)
print("FIRST 5 ROWS OF DATA (Preview)")
print("-" * 70)
print(data.head())  # head() shows the first 5 rows by default
print()

# Step 4: Look at the last few rows
print("-" * 70)
print("LAST 3 ROWS OF DATA")
print("-" * 70)
print(data.tail(3))  # tail(3) shows the last 3 rows
print()

# Step 5: Get information about data types
print("-" * 70)
print("DATA TYPES AND NON-NULL COUNTS")
print("-" * 70)
# info() shows column names, data types, and count of non-null values.
# It prints the report itself and returns None, so we call it directly:
# wrapping it in print() would add a stray "None" line to the output.
data.info()
print()

# Step 6: Get statistical summary
print("-" * 70)
print("STATISTICAL SUMMARY (for numerical columns)")
print("-" * 70)
# describe() gives us statistics like mean, std, min, max, etc.
print(data.describe())
print()

# Step 7: Check for missing values
print("-" * 70)
print("MISSING VALUES CHECK")
print("-" * 70)
# isnull() marks each missing cell as True; sum() then counts them per column
missing_values = data.isnull().sum()
print("Number of missing values per column:")
print(missing_values)
print()

# Adding up the per-column counts gives the total for the whole dataset
if missing_values.sum() == 0:
    print("✅ Great! No missing values found.")
else:
    print("⚠️ Some columns have missing values. You may need to handle them.")
print()

# Step 8: Get unique values in a column
print("-" * 70)
print("SAMPLE: UNIQUE VALUES")
print("-" * 70)
# Let's see how many unique values exist in the first column
first_column = data.columns[0]
unique_count = data[first_column].nunique()
print(f"The column '{first_column}' has {unique_count} unique value(s)")
print()

# Summary
print("=" * 70)
print("SUMMARY")
print("=" * 70)
print("You've learned how to:")
print(" ✓ Load data from a CSV file using pandas")
print(" ✓ Check the size and shape of your dataset")
print(" ✓ View the first and last rows")
print(" ✓ Understand data types")
print(" ✓ Get statistical summaries")
print(" ✓ Check for missing values")
print()
print("Next step: Try loading other CSV files from the data/ folder!")
print("=" * 70)

# Pro Tips:
# - Always explore your data before analyzing it
# - Check for missing values and understand why they might be missing
# - Look at the data types to ensure they make sense
# - Use head() and tail() to spot any obvious issues with your data