diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..330d34c --- /dev/null +++ b/.coveragerc @@ -0,0 +1,22 @@ +[run] +# Only measure coverage for these source directories +source = recipes, syncers + +[report] +# Omit tests, docs, markdown, config, and common non-code files from coverage report +omit = + *.md + *.txt + *.rst + *.yml + *.yaml + *.ini + setup.py + LICENSE + README* + CONTRIBUTING* + .gitignore + .coveragerc + __init__.py + pytest.ini + requirements.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..e69de29 diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 0000000..072a4d4 --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,36 @@ +name: Run Tests and Upload Coverage + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest pytest-cov + + - name: Run tests with coverage + run: | + pytest --cov=src --cov-report=xml + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + slug: leftkats/DataPytheon diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2b2d8e7 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.4.4 # or your preferred version + hooks: + - id: ruff + args: ["--fix", "--line-length=120"] \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..58f790d --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ 
-0,0 +1,43 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in **DataPytheon** a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- Harassment, intimidation, or discrimination in any form +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as physical or electronic addresses, without explicit permission +- Other conduct which could reasonably be considered inappropriate in a professional setting + +## Enforcement Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +## Reporting Guidelines + +If you are subject to or witness unacceptable behavior, please report it by contacting the project maintainers. All reports will be reviewed and investigated promptly and fairly. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders, who will decide on the best course of action. This may include a warning or temporary or permanent ban from the project. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.1, available at https://www.contributor-covenant.org/version/2/1/code_of_conduct.html diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..8c353bd --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,50 @@ +# Contributing to DataPytheon + +Thank you for your interest in contributing! :sparkles: Whether you're a first-timer or an experienced contributor, we welcome your help in making DataPytheon better. + +--- + +## How You Can Help + +- Add new **data recipes** in the `recipes/` folder +- Add new **API syncers** in the `syncers/` folder +- Fix bugs or improve existing scripts +- Improve documentation or add examples +- Write or improve tests in the `tests/` folder + +--- + +## Getting Started + +1. **Fork the repository** +Click the "Fork" button at the top right of the repo page. + +2. **Clone your fork locally** +```bash +git clone https://github.com/your-username/DataPytheon.git +cd DataPytheon +``` +3.Create a new branch for your work +```bash +git checkout -b feature/your-feature-name +``` +4.Create a new branch for your work +``bash +pip install -r requirements.txt +``` +5. Make your changes +Add your script, improve docs, or fix bugs. +6. Test your changes +Run existing tests and/or add new ones. +```bash +pytest +``` +7. Commit and push +```bash +git add . +git commit -m "Add feature: description" +git push origin feature/your-feature-name +``` +8. Open a Pull Request + +Go to your fork on GitHub and click "Compare & pull request". Describe your changes clearly. 
diff --git a/README.md b/README.md index 83f0366..ce1976b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,47 @@ -# DataPytheon -A mythical collection of ready-to-use Python scripts for fetching, cleaning, and syncing data from public datasets and APIs. Cook data fast with recipes and syncers! +# :snake: DataPytheon + +[![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-3100/) +[![codecov](https://codecov.io/gh/leftkats/DataPytheon/graph/badge.svg?token=C69BFSAR0S)](https://codecov.io/gh/leftkats/DataPytheon) + + +Welcome to **DataPytheon** – a mythical library of **easy-to-use Python scripts** that help you **access, clean, and explore datasets** from both public repositories and live APIs. + +Whether you're a **beginner learning data science**, a **developer prototyping fast**, or an **open-source contributor**, this project gives you plug-and-play tools to handle real-world data with ease. + +--- + +## :bookmark_tabs: What Is This? + +**DataPytheon** is a hybrid repository that offers: + +- **`recipes/`** — Pre-cleaned **static datasets** (like Titanic, Iris, Netflix, etc.) +- **`syncers/`** — Scripts to **fetch real-time data** from public APIs (like exchange rates, crypto prices, weather, etc.) + +All scripts return **ready-to-use Pandas DataFrames**, ideal for quick analysis, learning, or feeding into models. + +Think of it as your **data prep toolbox** — one line of code away from clean, structured data. + +--- + +## :hammer_and_wrench: Who Is It For? 
+ +- :student: **Beginners** in Python, data science, or machine learning +- :computer: **Developers** who want quick dataset access without boilerplate +- :sparkles: **Contributors** looking for a simple and valuable open-source project +- :books: **Educators** who need ready datasets for teaching or assignments + +--- + +## :file_folder: Project Structure + +DataPytheon/ +│ +├── recipes/ # Static datasets +│ └── titanic.py # Example recipe +│ +├── syncers/ # Live/API data scripts +│ └── exchange_rates.py # Example syncer +│ +├── tests/ # Basic unit tests for scripts +│ └── test_titanic.py + diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..460e754 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,18 @@ + +codecov: + require_ci_to_pass: yes + +coverage: + status: + project: + default: + target: 85% + threshold: 2% # Reject PR if coverage drops more than 2% + patch: + default: + threshold: 2% + +comment: + layout: "reach, diff, flags, files" + behavior: default + require_changes: false diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..55055c9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.ruff] +line-length = 120 +target-version = "py310" +select = ["E", "F", "I"] # Error, Pyflakes, and Import sort diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..5cfeedd --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +pythonpath = src +addopts = --cov=src --cov-report=term --cov-fail-under=80 +testpaths = tests +python_files = test_*.py +norecursedirs = .git .tox .venv .idea .vscode diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e8548cc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,17 @@ +# Core data stack +pandas>=1.5 +numpy>=1.22 +seaborn==0.13.2 + +# API requests +requests>=2.31 + +# Testing +pytest>=7.4 +pytest-cov>=4.1 + +# Linting & formatting +ruff>=0.4.4 + +# Pre-commit hooks +pre-commit>=3.6 diff --git a/src/__init__.py b/src/__init__.py new 
"""
Titanic Dataset Recipe
----------------------
Loads and preprocesses the Titanic dataset.
Source: https://www.kaggle.com/c/titanic/data (via seaborn)

Steps:
- Loads from seaborn (no API key needed)
- Basic cleaning and transformation
- Returns a ready-to-use pandas DataFrame
"""

import seaborn as sns


def load_titanic_data():
    """Load and clean the Titanic dataset.

    Returns:
        pandas.DataFrame: Cleaned Titanic data with redundant columns
        removed, no nulls in 'age' or 'embarked', and low-cardinality
        columns cast to the 'category' dtype.
    """
    # Load dataset from seaborn's bundled data (no API key needed)
    df = sns.load_dataset("titanic")

    # Drop columns with too many missing values or redundant info
    df = df.drop(columns=["deck", "embark_town", "alive"])

    # Fill missing 'embarked' with the most common port ("S") BEFORE any
    # row dropping. The original code called fillna AFTER
    # dropna(subset=["embarked", "age"]), which made the fill a no-op and
    # silently discarded the rows the comment claimed to rescue.
    df["embarked"] = df["embarked"].fillna("S")

    # 'age' has no sensible default value, so drop rows where it is missing
    df = df.dropna(subset=["age"])

    # Convert categorical columns to category dtype (smaller, faster groupby)
    cat_cols = ["sex", "class", "embarked", "who", "adult_male", "alone"]
    for col in cat_cols:
        df[col] = df[col].astype("category")

    return df


# Example usage
if __name__ == "__main__":
    df = load_titanic_data()
    print(df.head())
"""
Exchange Rates Syncer
----------------------
Fetches real-time exchange rates using the Frankfurter API.
Source: https://www.frankfurter.app/

Steps:
- Calls the public API (no API key required)
- Retrieves exchange rates for a given base currency
- Converts the result into a pandas DataFrame
"""

import pandas as pd
import requests


def fetch_exchange_rates(base_currency="USD", timeout=10):
    """
    Fetches latest exchange rates for the given base currency.

    Parameters:
        base_currency (str): ISO 4217 code (e.g., 'USD', 'EUR', 'GBP')
        timeout (float): Seconds to wait for the API before giving up.
            Prevents the call (and any CI job running it) from hanging
            forever on a stalled connection.

    Returns:
        pd.DataFrame: Tidy DataFrame with columns
        ['currency', 'rate', 'base', 'date']

    Raises:
        Exception: If the API responds with a non-200 status code.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # Pass the base currency via `params` so requests URL-encodes it;
    # the original f-string interpolation sent it raw.
    url = "https://api.frankfurter.app/latest"
    response = requests.get(url, params={"from": base_currency}, timeout=timeout)

    if response.status_code != 200:
        raise Exception(
            f"API request failed with status {response.status_code}: {response.text}"
        )

    data = response.json()

    # Flatten {"rates": {code: rate, ...}} into one row per currency
    df = pd.DataFrame(list(data["rates"].items()), columns=["currency", "rate"])
    df["base"] = data["base"]
    df["date"] = data["date"]

    return df


# Example usage
if __name__ == "__main__":
    df = fetch_exchange_rates("USD")
    print(df.head())
import pandas as pd

from src.syncers.exchange_rates import fetch_exchange_rates


def test_fetch_exchange_rates_returns_dataframe():
    """The syncer should yield a non-empty, well-typed DataFrame for USD."""
    result = fetch_exchange_rates("USD")

    # Must come back as a pandas DataFrame
    assert isinstance(result, pd.DataFrame), "Expected a pandas DataFrame"

    # Every required column must be present
    required = {"currency", "rate", "base", "date"}
    assert required.issubset(result.columns), (
        f"Missing expected columns: {required - set(result.columns)}"
    )

    # The API should always report at least one currency pair
    assert len(result) > 0, "Expected at least one exchange rate row"

    # Optional: sanity-check the dtypes of the key columns
    assert result["currency"].dtype == object, (
        "'currency' should be of type object (string)"
    )
    assert pd.api.types.is_numeric_dtype(result["rate"]), "'rate' should be numeric"