diff --git a/.github/workflows/update_data.yml b/.github/workflows/update_data.yml new file mode 100644 index 0000000..4e18620 --- /dev/null +++ b/.github/workflows/update_data.yml @@ -0,0 +1,22 @@ +name: Update Microsoft Stock Price Data + +on: + schedule: + # weekly + - cron: "0 0 * * 0" + # support manual trigger + workflow_dispatch: + +jobs: + update_data: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + - name: Install dependencies + run: pip install -r requirements.txt + - name: Update data + run: python lab_13/update_data.py \ No newline at end of file diff --git a/lab_13/create_table.ipynb b/lab_13/create_table.ipynb new file mode 100644 index 0000000..5e15fce --- /dev/null +++ b/lab_13/create_table.ipynb @@ -0,0 +1,458 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5b781f40", + "metadata": {}, + "outputs": [], + "source": [ + "# and we will store the data in bigquery\n", + "import pandas_gbq\n", + "import pydata_google_auth\n", + "\n", + "import yfinance as yf # for downloading stock data" + ] + }, + { + "cell_type": "markdown", + "id": "84cd426d", + "metadata": {}, + "source": [ + "### Authentication" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8c0234f7", + "metadata": {}, + "outputs": [], + "source": [ + "# BigQuery authentication\n", + "SCOPES = [\n", + " 'https://www.googleapis.com/auth/cloud-platform',\n", + " 'https://www.googleapis.com/auth/drive',\n", + "]\n", + "\n", + "credentials = pydata_google_auth.get_user_credentials(\n", + " SCOPES,\n", + " # Note, this doesn't work if you're running from a notebook on a\n", + " # remote server, such as over SSH or with Google Colab. 
In those cases,\n", + " # install the gcloud command line interface and authenticate with the\n", + " # `gcloud auth application-default login` command and the `--no-browser`\n", + " # option.\n", + " auth_local_webserver=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7f461dc3", + "metadata": {}, + "source": [ + "### Getting daily data from last month" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ada8c4db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OpenHighLowCloseVolumeDividendsStock Splits
Date
2025-03-18 00:00:00-04:00387.070007387.369995381.100006383.519989194869000.00.0
2025-03-19 00:00:00-04:00385.529999389.679993384.000000387.820007191855000.00.0
2025-03-20 00:00:00-04:00385.739990391.790009383.279999386.839996184705000.00.0
2025-03-21 00:00:00-04:00383.220001391.739990382.799988391.260010396759000.00.0
2025-03-24 00:00:00-04:00395.399994395.399994389.809998393.079987210045000.00.0
2025-03-25 00:00:00-04:00393.920013396.359985392.640015395.160004157750000.00.0
2025-03-26 00:00:00-04:00395.000000395.309998388.570007389.970001161084000.00.0
2025-03-27 00:00:00-04:00390.130005392.239990387.399994390.579987137668000.00.0
2025-03-28 00:00:00-04:00388.079987389.130005376.929993378.799988216320000.00.0
2025-03-31 00:00:00-04:00372.540009377.070007367.239990375.390015351847000.00.0
2025-04-01 00:00:00-04:00374.649994382.850006373.230011382.190002196895000.00.0
2025-04-02 00:00:00-04:00377.970001385.079987376.619995382.140015160926000.00.0
2025-04-03 00:00:00-04:00374.790009377.480011369.350006373.109985301980000.00.0
2025-04-04 00:00:00-04:00364.130005374.589996359.480011359.839996492099000.00.0
2025-04-07 00:00:00-04:00350.880005371.000000344.790009357.859985504250000.00.0
2025-04-08 00:00:00-04:00368.260010373.649994350.250000354.559998358689000.00.0
2025-04-09 00:00:00-04:00353.540009393.230011353.100006390.489990501997000.00.0
2025-04-10 00:00:00-04:00382.059998383.899994367.799988381.350006380244000.00.0
2025-04-11 00:00:00-04:00380.640015390.049988378.890015388.450012238392000.00.0
2025-04-14 00:00:00-04:00393.220001394.649994384.209991387.809998192512000.00.0
2025-04-15 00:00:00-04:00388.510010391.890015384.160004385.730011171999000.00.0
2025-04-16 00:00:00-04:00380.670013381.609985368.000000371.609985219678000.00.0
2025-04-17 00:00:00-04:00373.440002374.321503366.890015367.779999208589070.00.0
\n", + "
" + ], + "text/plain": [ + " Open High Low Close \\\n", + "Date \n", + "2025-03-18 00:00:00-04:00 387.070007 387.369995 381.100006 383.519989 \n", + "2025-03-19 00:00:00-04:00 385.529999 389.679993 384.000000 387.820007 \n", + "2025-03-20 00:00:00-04:00 385.739990 391.790009 383.279999 386.839996 \n", + "2025-03-21 00:00:00-04:00 383.220001 391.739990 382.799988 391.260010 \n", + "2025-03-24 00:00:00-04:00 395.399994 395.399994 389.809998 393.079987 \n", + "2025-03-25 00:00:00-04:00 393.920013 396.359985 392.640015 395.160004 \n", + "2025-03-26 00:00:00-04:00 395.000000 395.309998 388.570007 389.970001 \n", + "2025-03-27 00:00:00-04:00 390.130005 392.239990 387.399994 390.579987 \n", + "2025-03-28 00:00:00-04:00 388.079987 389.130005 376.929993 378.799988 \n", + "2025-03-31 00:00:00-04:00 372.540009 377.070007 367.239990 375.390015 \n", + "2025-04-01 00:00:00-04:00 374.649994 382.850006 373.230011 382.190002 \n", + "2025-04-02 00:00:00-04:00 377.970001 385.079987 376.619995 382.140015 \n", + "2025-04-03 00:00:00-04:00 374.790009 377.480011 369.350006 373.109985 \n", + "2025-04-04 00:00:00-04:00 364.130005 374.589996 359.480011 359.839996 \n", + "2025-04-07 00:00:00-04:00 350.880005 371.000000 344.790009 357.859985 \n", + "2025-04-08 00:00:00-04:00 368.260010 373.649994 350.250000 354.559998 \n", + "2025-04-09 00:00:00-04:00 353.540009 393.230011 353.100006 390.489990 \n", + "2025-04-10 00:00:00-04:00 382.059998 383.899994 367.799988 381.350006 \n", + "2025-04-11 00:00:00-04:00 380.640015 390.049988 378.890015 388.450012 \n", + "2025-04-14 00:00:00-04:00 393.220001 394.649994 384.209991 387.809998 \n", + "2025-04-15 00:00:00-04:00 388.510010 391.890015 384.160004 385.730011 \n", + "2025-04-16 00:00:00-04:00 380.670013 381.609985 368.000000 371.609985 \n", + "2025-04-17 00:00:00-04:00 373.440002 374.321503 366.890015 367.779999 \n", + "\n", + " Volume Dividends Stock Splits \n", + "Date \n", + "2025-03-18 00:00:00-04:00 19486900 0.0 0.0 \n", + "2025-03-19 
00:00:00-04:00 19185500 0.0 0.0 \n", + "2025-03-20 00:00:00-04:00 18470500 0.0 0.0 \n", + "2025-03-21 00:00:00-04:00 39675900 0.0 0.0 \n", + "2025-03-24 00:00:00-04:00 21004500 0.0 0.0 \n", + "2025-03-25 00:00:00-04:00 15775000 0.0 0.0 \n", + "2025-03-26 00:00:00-04:00 16108400 0.0 0.0 \n", + "2025-03-27 00:00:00-04:00 13766800 0.0 0.0 \n", + "2025-03-28 00:00:00-04:00 21632000 0.0 0.0 \n", + "2025-03-31 00:00:00-04:00 35184700 0.0 0.0 \n", + "2025-04-01 00:00:00-04:00 19689500 0.0 0.0 \n", + "2025-04-02 00:00:00-04:00 16092600 0.0 0.0 \n", + "2025-04-03 00:00:00-04:00 30198000 0.0 0.0 \n", + "2025-04-04 00:00:00-04:00 49209900 0.0 0.0 \n", + "2025-04-07 00:00:00-04:00 50425000 0.0 0.0 \n", + "2025-04-08 00:00:00-04:00 35868900 0.0 0.0 \n", + "2025-04-09 00:00:00-04:00 50199700 0.0 0.0 \n", + "2025-04-10 00:00:00-04:00 38024400 0.0 0.0 \n", + "2025-04-11 00:00:00-04:00 23839200 0.0 0.0 \n", + "2025-04-14 00:00:00-04:00 19251200 0.0 0.0 \n", + "2025-04-15 00:00:00-04:00 17199900 0.0 0.0 \n", + "2025-04-16 00:00:00-04:00 21967800 0.0 0.0 \n", + "2025-04-17 00:00:00-04:00 20858907 0.0 0.0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dat = yf.Ticker(\"MSFT\")\n", + "msft_df = dat.history(period='1mo')\n", + "msft_df" + ] + }, + { + "cell_type": "markdown", + "id": "e9c0dd1c", + "metadata": {}, + "source": [ + "## Creating BigQuery Table" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "180c6f83", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 874.00it/s]\n" + ] + } + ], + "source": [ + "# to BigQuery\n", + "pandas_gbq.to_gbq(\n", + " msft_df,\n", + " destination_table='stock_data.msft',\n", + " project_id='sipa-adv-c-roberto',\n", + " if_exists='replace',\n", + " credentials=credentials,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + 
"""Keep the BigQuery table of MSFT daily prices in sync with Yahoo Finance.

Run as a script, this module downloads the most recent month of MSFT daily
prices with yfinance, reads what is already stored in BigQuery, and appends
only the rows that are newer than the newest stored date.

NOTE(review): the scheduled GitHub Actions workflow added in the same change
runs ``python lab_13/update_data.py`` while this module is added as
``lab_13/load_data.py``, and that workflow step does not export ``BQ_LAB13``
to the job environment -- confirm both before relying on the schedule.
"""
import json
import os

import pandas_gbq
import pydata_google_auth  # noqa: F401  (kept: imported by the original module)
import yfinance as yf  # for downloading stock data
from google.oauth2 import service_account

# Single source of truth for the identifiers every function below shares.
PROJECT_ID = "sipa-adv-c-roberto"
TABLE_ID = "stock_data.msft"
CREDENTIALS_ENV_VAR = "BQ_LAB13"
SCOPES = (
    "https://www.googleapis.com/auth/cloud-platform",
    "https://www.googleapis.com/auth/drive",
)


def get_price_data(ticker="MSFT", period="1mo"):
    """Download recent price history from Yahoo Finance.

    Args:
        ticker: Symbol passed to ``yf.Ticker``. Defaults to ``"MSFT"``.
        period: Look-back window passed to ``Ticker.history``. Defaults to
            ``"1mo"``.

    Returns:
        The DataFrame returned by ``Ticker.history`` (the trading dates are
        carried in the index, not in a column).
    """
    return yf.Ticker(ticker).history(period=period)


def get_bq_credentials():
    """Build service-account credentials from the ``BQ_LAB13`` variable.

    The variable must hold the service-account key as JSON text. It is parsed
    here because ``from_service_account_info`` expects the decoded mapping,
    not the raw string that ``os.environ`` returns.

    Returns:
        ``google.oauth2.service_account.Credentials`` scoped with ``SCOPES``.

    Raises:
        RuntimeError: If the variable is unset/empty or is not valid JSON.
    """
    raw_key = os.environ.get(CREDENTIALS_ENV_VAR)
    if not raw_key:
        raise RuntimeError(
            f"Environment variable {CREDENTIALS_ENV_VAR} is not set; it must "
            "contain the service-account key as JSON."
        )
    try:
        key_info = json.loads(raw_key)
    except json.JSONDecodeError as err:
        raise RuntimeError(
            f"Environment variable {CREDENTIALS_ENV_VAR} does not contain "
            "valid JSON."
        ) from err
    return service_account.Credentials.from_service_account_info(
        key_info,
        scopes=list(SCOPES),
    )


def get_bq_data(credentials=None):
    """Read every row of the stored price table into a DataFrame.

    Args:
        credentials: Optional credentials to reuse. When omitted they are
            created with ``get_bq_credentials()``.

    Returns:
        The DataFrame produced by ``pandas_gbq.read_gbq`` for
        ``SELECT * FROM `stock_data.msft```.
    """
    if credentials is None:
        credentials = get_bq_credentials()
    query = f"SELECT * FROM `{TABLE_ID}`"
    return pandas_gbq.read_gbq(
        query,
        project_id=PROJECT_ID,
        credentials=credentials,
    )


def update_data():
    """Append price rows newer than the newest stored ``Date`` to BigQuery.

    Prints ``"Data updated"`` after a successful append and ``"No new data"``
    when there is nothing to add.
    """
    # get the data from yfinance
    msft_df = get_price_data()
    if msft_df.empty:
        print("No new data")
        return

    # One credentials object serves both the read and the write, so the
    # append authenticates the same way as the query.
    credentials = get_bq_credentials()
    bq_df = get_bq_data(credentials)

    if bq_df.empty:
        # Nothing stored yet: max() of an empty column is NaT, which never
        # compares as older, so upload the whole window explicitly.
        new_data = msft_df
    else:
        # NOTE(review): assumes the stored ``Date`` column is timezone-aware
        # like the yfinance index -- confirm against the table schema.
        bq_latest_date = bq_df["Date"].max()
        new_data = msft_df[msft_df.index > bq_latest_date]

    if new_data.empty:
        print("No new data")
        return

    # The dates live in the index; move them into a ``Date`` column before
    # uploading. NOTE(review): pandas-gbq appears to upload columns only, so
    # without this the dates would not be stored -- confirm in its docs.
    pandas_gbq.to_gbq(
        new_data.reset_index(),
        TABLE_ID,
        project_id=PROJECT_ID,
        if_exists="append",
        credentials=credentials,
    )
    print("Data updated")


if __name__ == "__main__":
    # update the data
    update_data()