diff --git a/Beta-L/Influencer_dashboard_design/amazonbar.png b/Beta-L/Influencer_dashboard_design/amazonbar.png
new file mode 100644
index 0000000..1f7c1d9
Binary files /dev/null and b/Beta-L/Influencer_dashboard_design/amazonbar.png differ
diff --git a/Beta-L/Influencer_dashboard_design/app.py b/Beta-L/Influencer_dashboard_design/app.py
new file mode 100644
index 0000000..dfb4c33
--- /dev/null
+++ b/Beta-L/Influencer_dashboard_design/app.py
@@ -0,0 +1,257 @@
+from flask import Flask, render_template, request, redirect, url_for
+import plotly.express as px
+import pandas as pd
+import mysql.connector
+
+app = Flask(__name__)
+
+
+# MySQL database connection parameters
+db_params = {
+    "host": "sebi-hackathon.mysql.database.azure.com",
+    "user": "mysql",
+    "password": "Betateam-L",
+    "database": "fintech-influencers-claims"
+}
+
+# Create a MySQL database connection
+db_connection = mysql.connector.connect(**db_params)
+db_cursor = db_connection.cursor()
+
+# Fetch data from the table
+query = "SELECT * FROM fintech_influencers"
+db_cursor.execute(query)
+result = db_cursor.fetchall()
+
+# Define column names
+columns = [desc[0] for desc in db_cursor.description]
+
+# Create a DataFrame from the fetched data
+df = pd.DataFrame(result, columns=columns)
+
+# Previously the dataset was loaded from a local file:
+# df = pd.read_excel('indummm.xlsx')
+
+df['Flagged Claims'] = ''
+
+# Dictionaries to store previous feedback and ratings
+previous_feedback = {}
+previous_ratings = {}
+
+@app.route('/')
+def index():
+    filtered_data = df  # Initialize with all data
+
+    keywords = request.args.get('keywords', '').split(',')
+    filter_option = request.args.get('filter', 'both')  # Default to 'both' (search all fields)
+
+    # Apply filters based on the selected filter option
+    if filter_option == 'claims':
+        filtered_data = df[df['Claim'].str.contains('|'.join(keywords))]
+    elif filter_option == 'influencers':
+        filtered_data = df[df['Name'].str.contains('|'.join(keywords))]
+    elif filter_option == 'keywords':
+        filtered_data = df[df['Keywords'].str.contains('|'.join(keywords))]
+    elif filter_option == 'both':
+        filtered_data = df[
+            (df['Claim'].str.contains('|'.join(keywords))) |
+            (df['Name'].str.contains('|'.join(keywords))) |
+            (df['Keywords'].str.contains('|'.join(keywords)))
+        ]
+
+    sorted_data = sorted(filtered_data.to_dict('records'), key=lambda x: x['Credibility_Score'], reverse=True)
+    cards_per_row = 3
+    rows = [sorted_data[i:i+cards_per_row] for i in range(0, len(sorted_data), cards_per_row)]
+    return render_template('index.html', rows=rows)
+
+
+@app.route('/profile/<influencer_name>')
+def profile(influencer_name):
+    filtered_data = df[df['Name'] == influencer_name]
+
+    if not filtered_data.empty:
+        influencer = filtered_data.iloc[0]
+        influencer_name = influencer['Name']
+        influencer_previous_feedback = previous_feedback.get(influencer_name, [])
+        influencer_previous_ratings = previous_ratings.get(influencer_name, [])
+
+        return render_template(
+            'profile.html',
+            influencer=influencer,
+            previous_feedback=influencer_previous_feedback,
+            previous_ratings=influencer_previous_ratings
+        )
+    else:
+        return "Influencer not found."
+
+
+@app.route('/visualize')
+def visualize():
+    return render_template('visualize.html')
+
+@app.route('/visualize/credibility_bar')
+def visualize_credential_bar():
+    fig = px.bar(df, x='Name', y='Credibility_Score', title='Credibility Score Distribution')
+    return fig.to_html()
+
+@app.route('/visualize/claim_pie')
+def visualize_claim_pie():
+    claim_counts = df['Claim'].value_counts()
+    fig = px.pie(claim_counts, names=claim_counts.index, values=claim_counts.values, title='Pie Chart for Claim Categories')
+    return fig.to_html()
+
+@app.route('/flag/<int:index>')
+def flag(index):
+    df.at[index, 'Flagged'] = True  # Assuming you have a 'Flagged' column in your DataFrame
+    return redirect(url_for('index'))
+
+@app.route('/feedback/<influencer_name>', methods=['GET', 'POST'])
+def feedback(influencer_name):
+    influencer = df[df['Name'] == influencer_name].iloc[0]
+
+    if request.method == 'POST':
+        feedback = request.form.get('feedback')
+
+        # Store previous feedback in the dictionary
+        if influencer_name in previous_feedback:
+            previous_feedback[influencer_name].append(feedback)
+        else:
+            previous_feedback[influencer_name] = [feedback]
+
+        # Update the DataFrame with the latest feedback
+        influencer_index = df[df['Name'] == influencer_name].index[0]  # Row index of the influencer
+        df.at[influencer_index, 'Feedback'] = feedback
+
+        return redirect(url_for('profile', influencer_name=influencer_name))
+
+    influencer_previous_feedback = previous_feedback.get(influencer_name, [])
+
+    return render_template('feedback_form.html', influencer=influencer, previous_feedback=influencer_previous_feedback)
+
+
+@app.route('/flag_claim/<influencer_name>/<claim>', methods=['POST'])
+def flag_claim(influencer_name, claim):
+    influencer_index = df[df['Name'] == influencer_name].index[0]
+
+    # Read the flagged-claims list for this influencer, write it back via .at
+    # so the change persists in the DataFrame (a copied Series would not).
+    flagged_claims = df.at[influencer_index, 'Flagged Claims']
+    if not isinstance(flagged_claims, list):
+        flagged_claims = []
+
+    if claim not in flagged_claims:
+        flagged_claims.append(claim)
+    df.at[influencer_index, 'Flagged Claims'] = flagged_claims
+
+    # Other flagging actions (e.g. updating a database or a log) could go here
+
+    return redirect(url_for('profile', influencer_name=influencer_name))
+
+
+@app.route('/rate/<influencer_name>', methods=['GET', 'POST'])
+def rate(influencer_name):
+    influencer = df[df['Name'] == influencer_name].iloc[0]
+
+    if request.method == 'POST':
+        rating = int(request.form.get('rating'))
+
+        # Store previous ratings in the dictionary
+        if influencer_name in previous_ratings:
+            previous_ratings[influencer_name].append(rating)
+        else:
+            previous_ratings[influencer_name] = [rating]
+
+        # Update the DataFrame with the latest rating
+        influencer_index = df[df['Name'] == influencer_name].index[0]  # Row index of the influencer
+        df.at[influencer_index, 'Rating'] = rating
+        return redirect(url_for('profile', influencer_name=influencer_name))
+
+    influencer_previous_ratings = previous_ratings.get(influencer_name, [])
+
+    return render_template('rating_form.html', influencer=influencer, previous_ratings=influencer_previous_ratings)
+
+
+@app.route('/compare', methods=['GET', 'POST'])
+def compare():
+    influencers_list = df['Name'].tolist()
+
+    if request.method == 'POST':
+        selected_influencers = request.form.getlist('influencers')
+
+        if len(selected_influencers) <= 3:
+            selected_data = df[df['Name'].isin(selected_influencers)]
+
+            # Create the Credibility Score graph
+            credibility_graph = px.bar(selected_data, x='Name', y='Credibility_Score', title='Comparison of Credibility Scores')
+
+            # Create the Rating graph
+            rating_graph = px.bar(selected_data, x='Name', y='Rating', title='Comparison of Ratings')
+
+            # Convert graphs to HTML
+            credibility_graph = credibility_graph.to_html(full_html=False, include_plotlyjs='cdn')
+            rating_graph = rating_graph.to_html(full_html=False, include_plotlyjs='cdn')
+
+            return render_template(
+                'compare.html',
+                influencers=influencers_list,
+                credibility_graph=credibility_graph,
+                rating_graph=rating_graph
+            )
+
+        else:
+            return "Please select up to 3 influencers for comparison."
+
+    return render_template('compare.html', influencers=influencers_list, credibility_graph='', rating_graph='')
+
+
+@app.route('/visualize/keyword_bar')
+def visualize_keyword_bar():
+    keyword_freq = df['Keywords'].str.split(', ').explode().value_counts()
+    fig = px.bar(x=keyword_freq.index, y=keyword_freq.values, title='Keyword Frequency Bar Chart')
+    return fig.to_html()
+
+@app.route('/visualize/scatter_credential_keywords')
+def visualize_scatter_credential_keywords():
+    fig = px.scatter(df, x='Credibility_Score', y='Keywords', title='Scatter Plot: Credibility vs. Keywords')
+    return fig.to_html()
+
+if __name__ == '__main__':
+    app.run(debug=True)
diff --git a/Beta-L/Influencer_dashboard_design/compare.html b/Beta-L/Influencer_dashboard_design/compare.html
new file mode 100644
index 0000000..9de2a2b
--- /dev/null
+++ b/Beta-L/Influencer_dashboard_design/compare.html
@@ -0,0 +1,37 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Compare Influencers</title>
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body>
+    <div class="header">
+        <h1>Compare Influencers</h1>
+    </div>
+    <form method="POST" action="/compare">
+        <h2>Select up to 3 influencers for comparison:</h2>
+        {% for influencer in influencers %}
+        <label>
+            <input type="checkbox" name="influencers" value="{{ influencer }}"> {{ influencer }}
+        </label>
+        {% endfor %}
+        <br>
+        <button type="submit" class="compare-button">Compare</button>
+    </form>
+    {% if credibility_graph %}
+    <div class="row">
+        <div>
+            <h2>Comparison of Credibility Scores</h2>
+            {{ credibility_graph|safe }}
+        </div>
+        <div>
+            <h2>Comparison of Ratings</h2>
+            {{ rating_graph|safe }}
+        </div>
+    </div>
+    {% endif %}
+</body>
+</html>
diff --git a/Beta-L/Influencer_dashboard_design/feedback_form.html b/Beta-L/Influencer_dashboard_design/feedback_form.html
new file mode 100644
index 0000000..9e95256
--- /dev/null
+++ b/Beta-L/Influencer_dashboard_design/feedback_form.html
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Feedback</title>
+</head>
+<body>
+    <h1>Feedback for Influencer: {{ influencer.Name }}</h1>
+    <form method="POST">
+        <textarea name="feedback" rows="4" cols="50" required></textarea>
+        <br>
+        <button type="submit">Submit Feedback</button>
+    </form>
+    <h2>Previous Feedback:</h2>
+    <ul>
+        {% for feedback_entry in previous_feedback %}
+        <li>{{ feedback_entry }}</li>
+        {% endfor %}
+    </ul>
+</body>
+</html>
diff --git a/Beta-L/Influencer_dashboard_design/index.html b/Beta-L/Influencer_dashboard_design/index.html
new file mode 100644
index 0000000..2b1dbeb
--- /dev/null
+++ b/Beta-L/Influencer_dashboard_design/index.html
@@ -0,0 +1,99 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Influencer Dashboard</title>
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body>
+    <div class="container">
+        <div class="header">
+            <h1>Influencer Dashboard</h1>
+        </div>
+        <div class="dashboard-content">
+            <h2>Get insights from Visualizations:</h2>
+            <a href="/visualize">View Visualizations</a>
+            <a href="/compare">Compare Influencers</a>
+        </div>
+        <form method="GET" action="/" class="search-bar">
+            <input type="text" name="keywords" class="search-input" placeholder="Comma-separated keywords">
+            <select name="filter">
+                <option value="both">All fields</option>
+                <option value="claims">Claims</option>
+                <option value="influencers">Influencers</option>
+                <option value="keywords">Keywords</option>
+            </select>
+            <button type="submit" class="search-btn">Search</button>
+        </form>
+        <div class="main">
+            {% for row in rows %}
+            <div class="row">
+                {% for influencer in row %}
+                <div class="card">
+                    <div class="title">
+                        <h1><a href="/profile/{{ influencer.Name }}">{{ influencer.Name }}</a></h1>
+                    </div>
+                    <div class="scores">Credibility Score: {{ influencer.Credibility_Score }}</div>
+                    <div class="des">{{ influencer.Claim }}</div>
+                </div>
+                {% endfor %}
+            </div>
+            {% endfor %}
+        </div>
+    </div>
+</body>
+</html>
diff --git a/Beta-L/Influencer_dashboard_design/profile.html b/Beta-L/Influencer_dashboard_design/profile.html
new file mode 100644
index 0000000..11e6580
--- /dev/null
+++ b/Beta-L/Influencer_dashboard_design/profile.html
@@ -0,0 +1,204 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Influencer Profile</title>
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body>
+    <div class="container">
+        <div class="header">
+            <h1>Name: {{ influencer.Name }}</h1>
+        </div>
+        <div class="scores">
+            <h2>Credibility Score: {{ influencer['Credibility_Score'] }}</h2>
+        </div>
+        <div class="des">
+            <h3>Explanation: {{ influencer.Explanation }}</h3>
+        </div>
+        <div class="des">
+            <h3>Claim: {{ influencer.Claim }}</h3>
+        </div>
+        <div class="des">
+            <h3>Keywords: {{ influencer.Keywords }}</h3>
+        </div>
+        <div class="des">
+            <h3>Rating: {{ influencer.Rating }}</h3>
+        </div>
+        <div>
+            <p>Social Network Links:</p>
+            {% if influencer['Social_Network_Links'] %}
+            <ul>
+                {% for link in influencer['Social_Network_Links'].split(',') %}
+                <li><a href="{{ link }}">{{ link }}</a></li>
+                {% endfor %}
+            </ul>
+            {% else %}
+            <p>No social network links available.</p>
+            {% endif %}
+        </div>
+        <div>
+            <p>Recent Post Links:</p>
+            {% if influencer['Recent_Post_Links'] %}
+            <ul>
+                {% for link in influencer['Recent_Post_Links'].split(',') %}
+                <li><a href="{{ link }}">{{ link }}</a></li>
+                {% endfor %}
+            </ul>
+            {% else %}
+            <p>No recent post links available.</p>
+            {% endif %}
+        </div>
+        <div>
+            <p>Click here to flag a misleading claim:</p>
+            <ul>
+                {% for claim in influencer.Claim.split(', ') %}
+                <li>
+                    {{ claim }}
+                    <form method="POST" action="{{ url_for('flag_claim', influencer_name=influencer.Name, claim=claim) }}">
+                        <button type="submit">Flag claim</button>
+                    </form>
+                </li>
+                {% endfor %}
+            </ul>
+        </div>
+        <div>
+            <h2>Previous Feedback:</h2>
+            <ul>
+                {% for feedback_entry in previous_feedback %}
+                <li>{{ feedback_entry }}</li>
+                {% endfor %}
+            </ul>
+            <a href="{{ url_for('feedback', influencer_name=influencer.Name) }}">Give feedback</a>
+        </div>
+        <div>
+            <h2>Rate Influencer:</h2>
+            <form method="POST" action="{{ url_for('rate', influencer_name=influencer.Name) }}">
+                <input type="number" name="rating" min="1" max="5" required>
+                <button type="submit">Submit Rating</button>
+            </form>
+        </div>
+        <div>
+            <h2>Previous Ratings:</h2>
+            <ul>
+                {% for rating in previous_ratings %}
+                <li>{{ rating }}</li>
+                {% endfor %}
+            </ul>
+        </div>
+        <a href="/">Back to Dashboard</a>
+    </div>
+</body>
+</html>
diff --git a/Beta-L/Influencer_dashboard_design/rating_form.html b/Beta-L/Influencer_dashboard_design/rating_form.html
new file mode 100644
index 0000000..05d0d56
--- /dev/null
+++ b/Beta-L/Influencer_dashboard_design/rating_form.html
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Rating</title>
+</head>
+<body>
+    <h1>Rate Influencer: {{ influencer.Name }}</h1>
+    <form method="POST">
+        <input type="number" name="rating" min="1" max="5" required>
+        <br>
+        <button type="submit">Submit Rating</button>
+    </form>
+    <h2>Previous Ratings:</h2>
+    <ul>
+        {% for rating in previous_ratings %}
+        <li>{{ rating }}</li>
+        {% endfor %}
+    </ul>
+</body>
+</html>
diff --git a/Beta-L/Influencer_dashboard_design/styles.css b/Beta-L/Influencer_dashboard_design/styles.css
new file mode 100644
index 0000000..d9659e6
--- /dev/null
+++ b/Beta-L/Influencer_dashboard_design/styles.css
@@ -0,0 +1,288 @@
+/* Reset some default styles */
+body, h1, h2, h3, h4, h5, h6, p, ul, ol, li {
+    margin: 0;
+    padding: 0;
+}
+
+body {
+    font-family: Arial, sans-serif;
+    background-color: #f5f5f5;
+    color: #333;
+}
+
+.container {
+    max-width: 1200px;
+    margin: 0 auto;
+    padding: 20px;
+}
+
+.header {
+    background-color: #4cff00;
+    color: #fff;
+    text-align: center;
+    padding: 20px;
+    margin-bottom: 20px;
+    border-radius: 10px;
+}
+
+.action-links {
+    padding-top: 10px;
+    background-color: #f9f9f9;
+    border-bottom: 1px solid #ddd;
+    margin-bottom: 20px;
+    border-radius: 10px;
+}
+
+.action-links a {
+    margin-right: 10px;
+    color: #3498db;
+    text-decoration: none;
+    transition: color 0.3s;
+}
+
+.action-links a:hover {
+    color: #217dbb;
+}
+
+.visualize-links ul {
+    list-style: none;
+    padding: 0;
+}
+
+.visualize-links li {
+    margin-bottom: 10px;
+}
+
+.visualize-links a {
+    color: #ff6a00;
+    text-decoration: none;
+    transition: color 0.3s;
+}
+
+.visualize-links a:hover {
+    color: #217dbb;
+}
+
+/* Apply Filters Form */
+form {
+    margin-bottom: 20px;
+}
+
+form label {
+    margin-right: 10px;
+}
+
+/* Apply Filters Button */
+button[type="submit"] {
+    background-color: #3498db;
+    color: #fff;
+    border: none;
+    padding: 10px 20px;
+    border-radius: 5px;
+    cursor: pointer;
+    transition: background-color 0.3s, color 0.3s;
+}
+
+button[type="submit"]:hover {
+    background-color: #217dbb;
+}
+
+/* Dashboard Styling */
+.dashboard-header {
+    background-color: #0094ff;
+    color: #f5f5f5;
+    text-align: center;
+    padding: 20px;
+    border-radius: 10px;
+    margin-bottom: 20px;
+}
+
+.dashboard-content {
+    text-align: center;
+    margin-top: 20px;
+    padding: 20px;
+    background-color: #f9f9f9;
+    border-radius: 10px;
+}
+
+.dashboard-content h2 {
+    margin-bottom: 10px;
+}
+
+/* Card UI styles */
+.main {
+    margin: 20px;
+}
+
+.row {
+    display: flex;
+    justify-content: space-between;
+    align-items: stretch;
+    margin-bottom: 20px;
+}
+
+.card {
+    width: calc(30.33% - 20px);
+    box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.2);
+    border-radius: 5px;
+    margin-right: 20px;
+    background-color: #fff;
+    overflow: hidden;
+    position: relative;
+}
+
+.image img {
+    height: 170px;
+    width: 100%;
+    text-align: center;
+    border-top-left-radius: 5px;
+    border-top-right-radius: 5px;
+}
+
+.title {
+    text-align: center;
+    padding: 10px;
+    background-color: #333;
+    color: #fff;
+    position: relative;
+}
+
+.scores {
+    text-align: center;
+    font-size: 14px;
+    color: #000000;
+    margin-top: 5px;
+}
+
+h1 {
+    font-size: 20px;
+}
+
+.des {
+    text-align: center;
+    padding: 10px;
+    border-bottom-left-radius: 5px;
+    border-bottom-right-radius: 5px;
+    background-color: #f5f5f5;
+}
+
+button {
+    margin-top: 10px;
+    background-color: #333;
+    color: #fff;
+    border: none;
+    border-radius: 5px;
+    padding: 5px 10px;
+    cursor: pointer;
+}
+
+button:hover {
+    background-color: #555;
+}
+
+/* Style for the search bar container */
+.search-bar {
+    display: flex;
+    align-items: center;
+    background-color: #f5f5f5;
+    padding: 10px;
+    border-radius: 5px;
+    box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.1);
+}
+
+/* Style for the filter dropdown container */
+.filter-dropdown {
+    position: relative;
+    margin-right: 10px;
+    cursor: pointer;
+}
+
+/* Style for the filter button */
+.filter-btn {
+    color: #000;
+    background-color: #fff;
+    border: 1px solid #ccc;
+    padding: 5px 10px;
+    border-radius: 5px;
+    cursor: pointer;
+}
+
+/* Style for the filter options */
+.filter-options {
+    display: none;
+    position: absolute;
+    background-color: #fff;
+    border: 1px solid #ccc;
+    border-radius: 5px;
+    top: 30px;
+    left: 0;
+    width: 100px;
+    z-index: 1;
+}
+
+/* Style for the filter options links */
+.filter-options a {
+    display: block;
+    padding: 5px 10px;
+    text-decoration: none;
+    color: #333;
+    transition: background-color 0.3s;
+}
+
+.filter-options a:hover {
+    background-color: #f5f5f5;
+}
+
+/* Show filter options when the filter button is hovered */
+.filter-dropdown:hover .filter-options {
+    display: block;
+}
+
+/* Style for the search input */
+.search-input {
+    flex: 1;
+    border: none;
+    padding: 8px;
+    border-radius: 5px;
+}
+
+/* Style for the search button */
+.search-btn {
+    background-color: #3498db;
+    border: none;
+    padding: 8px;
+    border-radius: 5px;
+    cursor: pointer;
+    transition: background-color 0.3s;
+}
+
+.search-btn img {
+    width: 20px;
+    height: 20px;
+}
+
+.search-btn:hover {
+    background-color: #217dbb;
+}
+
+.flagged {
+    background-color: red;
+}
+
+/* Add this in your styles.css */
+.compare-button {
+    background-color: #3498db;
+    color: white;
+    border: none;
+    padding: 10px 20px;
+    border-radius: 5px;
+    cursor: pointer;
+    font-size: 16px;
+    margin-top: 20px;
+}
diff --git a/Beta-L/Influencer_dashboard_design/visualize.html b/Beta-L/Influencer_dashboard_design/visualize.html
new file mode 100644
index 0000000..265d1fe
--- /dev/null
+++ b/Beta-L/Influencer_dashboard_design/visualize.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Visualizations</title>
+</head>

Available Visualizations

+ + Back to Dashboard + + diff --git a/Beta-L/Ml_codes/Assign_credibility_scores_to_influencers_and_claims_using_Scikit_learn_and_TensorFlow.ipynb b/Beta-L/Ml_codes/Assign_credibility_scores_to_influencers_and_claims_using_Scikit_learn_and_TensorFlow.ipynb new file mode 100644 index 0000000..49f2d88 --- /dev/null +++ b/Beta-L/Ml_codes/Assign_credibility_scores_to_influencers_and_claims_using_Scikit_learn_and_TensorFlow.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/" + }, + "id": "oGnu0h9yf9T0", + "outputId": "456cb1e0-31d3-41e2-d3a0-4501a440d68b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " If ticker of Indian Stock Market, add \".NS\" at last\n", + " For example -- \"ADANIPOWER.NS\" for Adani Power\n", + " -- \"TATAMOTORS.NS\" for Tata Motors\n", + "\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import pandas_datareader as web\n", + "import datetime as dt\n", + "from sklearn.manifold import trustworthiness\n", + "\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from tensorflow.python.keras.models import Sequential\n", + "from tensorflow.python.keras.layers import Dense, Dropout, LSTM\n", + "\n", + "# Load Data\n", + "print(\"\"\"\n", + " If ticker of Indian Stock Market, add \".NS\" at last\n", + " For example -- \"ADANIPOWER.NS\" for Adani Power\n", + " -- \"TATAMOTORS.NS\" for Tata Motors\n", + "\"\"\")\n", + "\n", + "company = input(\"Enter ticker symbol :\" ).upper()\n", + "\n", + "start = dt.datetime(2012,1,1)\n", + "end = dt.datetime(2022,1,1)\n", + "\n", + "data = web.DataReader(company, 'yahoo', start, end)\n", + "# Prepare Data\n", + "scaler = MinMaxScaler(feature_range=(0,1))\n", + "scaled_data = scaler.fit_transform(data['Close'].values.reshape(-1,1))\n", + "prediction_days = 60\n", + "x_train = []\n", + "y_train = []\n", + "\n", + "print(\"Collecting data...\")\n", + "\n", + "for x in range(prediction_days, len(scaled_data)):\n", + " x_train.append(scaled_data[x-prediction_days:x,0])\n", + " y_train.append(scaled_data[x,0])\n", + "\n", + "# Converting to numpy arrays\n", + "\n", + "x_train, y_train = np.array(x_train), np.array(y_train)\n", + "x_train = np.reshape(x_train,(x_train.shape[0],x_train.shape[1],1))\n", + "\n", + "print(\"Initializing...\")\n", + "\n", + "# Training Model\n", + "\n", + "model = Sequential()\n", + "model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1],1)))\n", + "model.add(Dropout(0.2))\n", + "model.add(LSTM(units=50, return_sequences=True))\n", + "model.add(Dropout(0.2))\n", + "model.add(LSTM(units=50, return_sequences=True))\n", + "model.add(Dropout(0.2))\n", + "model.add(LSTM(units=50))\n", + "model.add(Dropout(0.2))\n", + "model.add(Dense(units=1)) # Prediction for next price\n", + "\n", + "\n", + "model.compile(optimizer='adam', loss='mean_squared_error')\n", + "model.fit(x_train, y_train, epochs=5, batch_size=32)\n", + "\n", + "# Testing\n", + "\n", + "test_start = dt.datetime(2020,1,1)\n", + "test_end = dt.datetime(2022,1,1)\n", + "\n", + "test_data = web.DataReader(company, 'yahoo', test_start, test_end)\n", + "actual_price = test_data['Close'].values\n", + "\n", + "total_dataset = pd.concat((data['Close'], test_data['Close']), axis=0)\n", + "\n", + "model_inputs = total_dataset[len(total_dataset) - len(test_data) - 
+    "model_inputs = model_inputs.reshape(-1,1)\n",
+    "model_inputs = scaler.transform(model_inputs)\n",
+    "\n",
+    "# Prediction\n",
+    "x_test = []\n",
+    "\n",
+    "print(\"Predicting...\")\n",
+    "\n",
+    "for x in range(prediction_days, len(model_inputs)):\n",
+    "    x_test.append(model_inputs[x-prediction_days:x,0])\n",
+    "\n",
+    "x_test = np.array(x_test)\n",
+    "x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))\n",
+    "\n",
+    "predicted_prices = model.predict(x_test)\n",
+    "predicted_prices = scaler.inverse_transform(predicted_prices)\n",
+    "\n",
+    "# Prediction for next day\n",
+    "real_data = [model_inputs[len(model_inputs) + 1 - prediction_days:len(model_inputs) + 1, 0]]\n",
+    "real_data = np.array(real_data)\n",
+    "real_data = np.reshape(real_data,(real_data.shape[0],real_data.shape[1],1))\n",
+    "\n",
+    "prediction = model.predict(real_data)\n",
+    "prediction = scaler.inverse_transform(prediction)\n",
+    "print(f\"Prediction {prediction} \")\n",
+    "\n",
+    "# Plotting the prediction\n",
+    "plt.plot(actual_price, color=\"blue\", label=f\"Actual {company} price\")\n",
+    "plt.plot(predicted_prices, color=\"green\", label=f\"Predicted {company} price\")\n",
+    "plt.title(f'{company} share prices')\n",
+    "plt.xlabel(\"Time\")\n",
+    "plt.ylabel(f\"{company} share price\")\n",
+    "plt.legend()\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1wq-ZJRaacpk"
+   },
+   "source": [
+    "A supervised algorithm for assigning credibility scores to influencers and claims in Indian stock markets can be built with Scikit-learn and TensorFlow as follows:\n",
+    "\n",
+    "1) Collect a dataset of historical data on stock prices, influencers, and claims. This data can be scraped from social media platforms, news websites, and other sources.\n",
+    "\n",
+    "2) Preprocess the data by cleaning it and removing any noise.\n",
+    "\n",
+    "3) Feature-engineer the data by extracting the relevant features. The features that you can consider include:\n",
+    "- Whether the influencer is registered as an advisor with SEBI\n",
+    "- Whether they follow SEBI’s registered advisor guidelines\n",
+    "- Whether they have any action pending from SEBI\n",
+    "- Whether they disclose any commission or conflict of interest\n",
+    "- Whether they provide evidence or references for their claim\n",
+    "- Whether they have a consistent track record of performance\n",
+    "- Whether they have a large and engaged following\n",
+    "\n",
+    "4) Train a supervised learning model on the preprocessed data. You can use a model such as logistic regression, support vector machines, or random forests (a minimal Scikit-learn sketch follows at the end of this note).\n",
+    "\n",
+    "5) Evaluate the performance of the model on a held-out test set.\n",
+    "\n",
+    "6) Use the model to assign credibility scores to new influencers and claims.\n",
+    "\n",
+    "Here are some specific algorithms that you could use:\n",
+    "\n",
+    "Logistic regression: This is a simple but effective algorithm that can be used for binary classification tasks. In this case, the binary classification task would be to classify whether an influencer or claim is credible or not.\n",
+    "\n",
+    "Support vector machine: This is a more powerful algorithm that can be used for both binary and multi-class classification tasks. It is generally more accurate than logistic regression, but it can also be more computationally expensive.\n",
+    "\n",
+    "Random forest: This is an ensemble learning algorithm that combines the predictions of multiple decision trees. 
It is often more accurate than logistic regression or support vector machines, but it can also be more computationally expensive.\n", + "The specific algorithm that you choose will depend on the size and complexity of your dataset, as well as the desired accuracy of the model.\n", + "\n", + "Scikit-learn is a popular Python library for machine learning. It provides implementations of many supervised learning algorithms, including logistic regression, support vector machines, and random forests. TensorFlow is another popular Python library for machine learning. It is a more general-purpose library that can be used for both supervised and unsupervised learning tasks.\n", + "\n", + "The following are some of the challenges that you might face when developing a supervised algorithm for assigning credibility scores to influencers and claims in Indian stock markets:\n", + "\n", + "The data is often noisy and incomplete.\n", + "The data is often biased.\n", + "The data is constantly changing.\n", + "It can be difficult to define what constitutes a credible influencer or claim.\n", + "Despite these challenges, it is possible to develop a supervised algorithm for assigning credibility scores to influencers and claims in Indian stock markets. By carefully cleaning and feature engineering the data, and by using a supervised learning model, you can develop a model that can be used to assign credibility scores to new influencers and claims with a high degree of accuracy.\n", + "\n", + "Here are some additional considerations for developing a supervised algorithm for assigning credibility scores to influencers and claims in Indian stock markets:\n", + "\n", + "The algorithm should be transparent and accountable. Users should be able to understand how the algorithm works and why it assigns the credibility scores that it does.\n", + "The algorithm should be fair and unbiased. It should not discriminate against any particular group of influencers or claims.\n", + "The algorithm should be robust to changes in the data. It should continue to work accurately even if the data changes over time." 
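+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next cell is a minimal, illustrative Scikit-learn sketch of steps 4-6 above. The file name 'influencer_claims.csv', the feature column names, and the binary 'credible' label are assumptions made for this example, not the project's actual dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal sketch of steps 4-6: train a classifier on engineered\n",
+    "# credibility features and use its class probabilities as scores.\n",
+    "# NOTE: the CSV name, feature columns and 'credible' label below are\n",
+    "# hypothetical placeholders, not the project's real dataset.\n",
+    "import pandas as pd\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import classification_report\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "data = pd.read_csv('influencer_claims.csv')  # hypothetical dataset\n",
+    "features = ['sebi_registered', 'follows_guidelines', 'action_pending',\n",
+    "            'discloses_conflicts', 'provides_evidence',\n",
+    "            'consistent_track_record', 'follower_engagement']\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    data[features], data['credible'], test_size=0.2, random_state=42)\n",
+    "\n",
+    "clf = RandomForestClassifier(n_estimators=200, random_state=42)\n",
+    "clf.fit(X_train, y_train)\n",
+    "print(classification_report(y_test, clf.predict(X_test)))\n",
+    "\n",
+    "# Probability of the 'credible' class acts as a 0-1 credibility score\n",
+    "data['credibility_score'] = clf.predict_proba(data[features])[:, 1]"
+   ]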
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/Beta-L/Ml_codes/Extract_relevant_information_from_the_data_using_NLTK.ipynb b/Beta-L/Ml_codes/Extract_relevant_information_from_the_data_using_NLTK.ipynb
new file mode 100644
index 0000000..b76fe97
--- /dev/null
+++ b/Beta-L/Ml_codes/Extract_relevant_information_from_the_data_using_NLTK.ipynb
@@ -0,0 +1,119 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "code",
+   "source": [
+    "import nltk\n",
+    "\n",
+    "nltk.download('punkt', quiet=True)  # tokenizer model used by word_tokenize\n",
+    "\n",
+    "def token_after(tokens, word):\n",
+    "    \"\"\"Returns the token following `word`, or None if `word` is absent.\"\"\"\n",
+    "    return tokens[tokens.index(word) + 1] if word in tokens else None\n",
+    "\n",
+    "def extract_relevant_information(text):\n",
+    "    \"\"\"Extracts relevant information from the text using NLTK.\n",
+    "\n",
+    "    Args:\n",
+    "        text: The text to extract information from.\n",
+    "\n",
+    "    Returns:\n",
+    "        A dictionary of extracted information.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Tokenize the text\n",
+    "    tokens = nltk.word_tokenize(text)\n",
+    "\n",
+    "    # Find the name and profile of the influencer\n",
+    "    influencer = token_after(tokens, \"@\")\n",
+    "    profile = \"Twitter user\"\n",
+    "\n",
+    "    # Find the content and tone of the claim\n",
+    "    claim = \" \".join(tokens[tokens.index(\"that\") + 1:]) if \"that\" in tokens else None\n",
+    "    tone = \"negative\"\n",
+    "\n",
+    "    # Find the date and time of the post (None when not mentioned)\n",
+    "    date = token_after(tokens, \"on\")\n",
+    "    time = token_after(tokens, \"at\")\n",
+    "\n",
+    "    # Find the number and sentiment of the comments\n",
+    "    count = token_after(tokens, \"over\")\n",
+    "    number_of_comments = int(count) if count and count.isdigit() else None\n",
+    "    sentiment_of_comments = \"negative\"\n",
+    "\n",
+    "    # Create a dictionary of extracted information\n",
+    "    extracted_information = {\n",
+    "        \"influencer\": influencer,\n",
+    "        \"profile\": profile,\n",
+    "        \"claim\": claim,\n",
+    "        \"tone\": tone,\n",
+    "        \"date\": date,\n",
+    "        \"time\": time,\n",
+    "        \"number_of_comments\": number_of_comments,\n",
+    "        \"sentiment_of_comments\": sentiment_of_comments,\n",
+    "    }\n",
+    "\n",
+    "    return extracted_information\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    # Get the text\n",
+    "    text = \"Influencer @johndoe posted a claim on Twitter that the new COVID-19 vaccine is dangerous. The post received over 1000 comments, most of which were negative.\"\n",
+    "\n",
+    "    # Extract the relevant information\n",
+    "    extracted_information = extract_relevant_information(text)\n",
+    "\n",
+    "    # Print the extracted information\n",
+    "    print(extracted_information)\n"
+   ],
+   "metadata": {
+    "id": "pUguyw0sX3F_"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Introduction: extracting relevant information from the data using NLTK.\n",
+    "\n",
+    "1) Tokenization: This is the process of breaking the text into individual words or tokens. NLTK provides a variety of tokenizers, such as the word tokenizer, the sentence tokenizer, and the paragraph tokenizer.\n",
+    "\n",
+    "2) Stemming: This is the process of reducing a word to its root form. NLTK provides the PorterStemmer and LancasterStemmer classes for stemming.\n",
+    "\n",
+    "3) Lemmatization: This is the process of reducing a word to its dictionary form. NLTK provides the WordNetLemmatizer class for lemmatization.\n",
+    "\n",
+    "4) Part-of-speech tagging: This is the process of assigning a part-of-speech tag to each word in the text. NLTK provides the nltk.pos_tag function for part-of-speech tagging.\n",
+    "\n",
+    "5) Named entity recognition: This is the process of identifying named entities in the text, such as people, organizations, and places. NLTK provides the nltk.ne_chunk function for named entity recognition.\n",
+    "\n",
+    "6) Sentiment analysis: This is the process of determining the sentiment of a piece of text, such as whether it is positive, negative, or neutral. NLTK provides the SentimentAnalyzer class (in nltk.sentiment) for sentiment analysis.\n",
+    "\n",
+    "Once we have performed these natural language processing tasks on the data, we can extract the relevant information from it. For example, we can extract the name and profile of the influencer, the content and tone of the claim, the date and time of the post, the number and sentiment of the comments, etc. A short demo of the tagging and NER steps follows in the next cell."
+   ],
+   "metadata": {
+    "id": "FzR3QvQeX8X4"
+   }
+  },
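+  {
+   "cell_type": "markdown",
+   "source": [
+    "A small, illustrative demo of the part-of-speech tagging and named-entity steps described above, using NLTK's nltk.pos_tag and nltk.ne_chunk. The sample sentence is invented for the example."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import nltk\n",
+    "\n",
+    "# Resources needed by the tokenizer, tagger and NE chunker\n",
+    "for pkg in ['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words']:\n",
+    "    nltk.download(pkg, quiet=True)\n",
+    "\n",
+    "sentence = 'Influencer John Doe praised Reliance Industries on Twitter.'  # invented example\n",
+    "tokens = nltk.word_tokenize(sentence)\n",
+    "tagged = nltk.pos_tag(tokens)  # list of (word, POS tag) pairs\n",
+    "tree = nltk.ne_chunk(tagged)   # tree with PERSON/ORGANIZATION/... chunks\n",
+    "print(tagged)\n",
+    "print(tree)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },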
+  {
+   "cell_type": "markdown",
+   "source": [],
+   "metadata": {
+    "id": "_C-U_vDgXlmU"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [],
+   "metadata": {
+    "id": "xqnHN_Y-XdU0"
+   }
+  }
+ ]
+}
\ No newline at end of file
diff --git a/Beta-L/Ml_codes/Stock_direction_prediction.ipynb b/Beta-L/Ml_codes/Stock_direction_prediction.ipynb
new file mode 100644
index 0000000..695db5c
--- /dev/null
+++ b/Beta-L/Ml_codes/Stock_direction_prediction.ipynb
@@ -0,0 +1,1821 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "colab": {
+   "name": "Stock direction prediction.ipynb",
+   "provenance": [],
+   "collapsed_sections": []
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  },
+  "accelerator": "GPU"
+ },
+ "cells": [
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "oMXGjAmvm3bw",
+    "colab_type": "code",
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 357
+    },
+    "outputId": "b523ba6e-cb40-4fce-bd41-13ef03aa45b3"
+   },
+   "source": [
+    "!nvidia-smi"
+   ],
+   "execution_count": null,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "text": [
+      "Mon Sep 14 14:38:42 2020       \n",
+      "+-----------------------------------------------------------------------------+\n",
+      "| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |\n",
+      "|-------------------------------+----------------------+----------------------+\n",
+      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
+      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
+      "|                               |                      |               MIG M. |\n",
+      "|===============================+======================+======================|\n",
+      "|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |\n",
+      "| N/A   34C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |\n",
+      "|                               |                      |                 
|\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "fvMiVpfanBxq", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "outputId": "186d1ce5-9daf-4a27-a97c-94a06b814f1a" + }, + "source": [ + "import ta\n", + "import tqdm\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.gaussian_process import GaussianProcessClassifier\n", + "from sklearn.gaussian_process.kernels import RBF\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score\n", + "\n", + "import tensorflow as tf\n", + "print(f'TensorFlow version: {tf.__version__}')" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "TensorFlow version: 2.3.0\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_MNmGI3VnEmE", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "outputId": "51fae6e2-ae21-4182-981d-33f9c73dea76" + }, + "source": [ + "!wget https://raw.githubusercontent.com/dksifoua/Stock-Market-Prediction/master/data/2019_AAPL_1min.csv" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "--2020-09-14 14:31:21-- https://raw.githubusercontent.com/dksifoua/Stock-Market-Prediction/master/data/2019_AAPL_1min.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 6092763 (5.8M) [text/plain]\n", + "Saving to: ‘2019_AAPL_1min.csv.1’\n", + "\n", + "2019_AAPL_1min.csv. 
100%[===================>] 5.81M 34.4MB/s in 0.2s \n", + "\n", + "2020-09-14 14:31:22 (34.4 MB/s) - ‘2019_AAPL_1min.csv.1’ saved [6092763/6092763]\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yrN21LvAnGLE", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 252 + }, + "outputId": "dca59956-fa06-4ffa-a09a-2e807f9e923e" + }, + "source": [ + "df = pd.read_csv('./2019_AAPL_1min.csv', header=0, index_col=0)\n", + "df.index = pd.to_datetime(df.index).tz_localize(None).to_period('T')\n", + "df = df.drop(['open', 'high', 'low'], axis=1)\n", + "print(df.shape)\n", + "df.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(101081, 2)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
closevolume
date
2019-01-02 14:30154.7803223.0
2019-01-02 14:31155.3251674.0
2019-01-02 14:32154.8503153.0
2019-01-02 14:33154.6005104.0
2019-01-02 14:34154.7602948.0
\n", + "
" + ], + "text/plain": [ + " close volume\n", + "date \n", + "2019-01-02 14:30 154.780 3223.0\n", + "2019-01-02 14:31 155.325 1674.0\n", + "2019-01-02 14:32 154.850 3153.0\n", + "2019-01-02 14:33 154.600 5104.0\n", + "2019-01-02 14:34 154.760 2948.0" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 5 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D92HLEwqoDHL", + "colab_type": "text" + }, + "source": [ + "# Data processing\n", + "\n", + "**Add targets**\n", + "\n", + "The target to be predicted in the $i^{th}$ day is calculated as follows:\n", + "\n", + "$$target_i = sign(P_{i+d} - P_i)$$\n", + "\n", + "Where $d$ is the number of minutes/days after which the prediction is to be made" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "TTJxOQPbnJHn", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "outputId": "8adcf5b0-c99a-404b-cd59-6b49966e0d2b" + }, + "source": [ + "df['label'] = df.close.shift(-1) - df.close\n", + "df.label = df.label.apply(lambda x: 0 if x < 0 else 1)\n", + "df.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
closevolumelabel
date
2019-01-02 14:30154.7803223.01
2019-01-02 14:31155.3251674.00
2019-01-02 14:32154.8503153.00
2019-01-02 14:33154.6005104.01
2019-01-02 14:34154.7602948.00
\n", + "
" + ], + "text/plain": [ + " close volume label\n", + "date \n", + "2019-01-02 14:30 154.780 3223.0 1\n", + "2019-01-02 14:31 155.325 1674.0 0\n", + "2019-01-02 14:32 154.850 3153.0 0\n", + "2019-01-02 14:33 154.600 5104.0 1\n", + "2019-01-02 14:34 154.760 2948.0 0" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 6 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZTi4U9H9oJz6", + "colab_type": "text" + }, + "source": [ + "**Technical indicators**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EJxdpD21qXuf", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 425 + }, + "outputId": "7ecb8cdd-96a7-448f-d85e-be667297fcce" + }, + "source": [ + "# Momentum indicators\n", + "df['roc'] = ta.momentum.roc(close=df.close) # Rate of Change (ROC)\n", + "df['rsi'] = ta.momentum.rsi(close=df.close) # Relative Strength Index (RSI)\n", + "df['tsi'] = ta.momentum.tsi(close=df.close) # True strength index (TSI)\n", + "\n", + "# Volatility indicators\n", + "bb_indicator = ta.volatility.BollingerBands(close=df.close)\n", + "df['bb_bbhi'] = bb_indicator.bollinger_hband_indicator() # Bollinger Band high indicator\n", + "df['bb_bbli'] = bb_indicator.bollinger_lband_indicator() # Bollinger Band low indicator\n", + "\n", + "# Trend indicators\n", + "aroon_indicator = ta.trend.AroonIndicator(close=df.close)\n", + "macd_indicator = ta.trend.MACD(close=df.close)\n", + "kst_indicator = ta.trend.KSTIndicator(close=df.close)\n", + "df['aroon_down'] = aroon_indicator.aroon_down() # Aroon Down Channel\n", + "df['aroon'] = aroon_indicator.aroon_indicator() # Aroon Indicator\n", + "df['aroon_up'] = aroon_indicator.aroon_up() # Aroon Up Channel\n", + "df['macd_line'] = macd_indicator.macd() # MACD Line\n", + "df['macd_hist'] = macd_indicator.macd_diff() # MACD Histogram\n", + "df['macd_signal'] = macd_indicator.macd_signal() # MACD Signal Line\n", + "df['kst'] = kst_indicator.kst() # Know Sure Thing (KST)\n", + "df['kst_diff'] = kst_indicator.kst_diff() # Diff Know Sure Thing (KST)\n", + "df['kst_signal'] = kst_indicator.kst_sig() # Signal Line Know Sure Thing (KST)\n", + "df['dpo'] = ta.trend.dpo(close=df.close) # Detrended Price Oscillator (DPO)\n", + "df['trix'] = ta.trend.trix(close=df.close) # Trix (TRIX)\n", + "df['sma_10'] = ta.trend.sma_indicator(close=df.close, n=10) # SMA n=10\n", + "df['sma_20'] = ta.trend.sma_indicator(close=df.close, n=20) # SMA n=20\n", + "df['sma_30'] = ta.trend.sma_indicator(close=df.close, n=30) # SMA n=30\n", + "df['sma_60'] = ta.trend.sma_indicator(close=df.close, n=60) # SMA n=60\n", + "df['ema_10'] = ta.trend.sma_indicator(close=df.close, n=10) # EMA n=10\n", + "df['ema_20'] = ta.trend.sma_indicator(close=df.close, n=20) # EMA n=20\n", + "df['ema_30'] = ta.trend.sma_indicator(close=df.close, n=30) # EMA n=30\n", + "df['ema_60'] = ta.trend.sma_indicator(close=df.close, n=60) # EMA n=60\n", + "\n", + "# Volume indicators\n", + "df['obv'] = ta.volume.on_balance_volume(close=df.close, volume=df.volume) # On Balance Volume (OBV)\n", + "df['vpt'] = ta.volume.volume_price_trend(close=df.close, volume=df.volume) # Volume-price trend (VPT)\n", + "df['fi'] = ta.volume.force_index(close=df.close, volume=df.volume) # Force Index (FI)\n", + "df['nvi'] = ta.volume.negative_volume_index(close=df.close, volume=df.volume) # Negative Volume Index (NVI)\n", + "\n", + "df.tail()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
closevolumelabelrocrsitsibb_bbhibb_bbliaroon_downaroonaroon_upmacd_linemacd_histmacd_signalkstkst_diffkst_signaldpotrixsma_10sma_20sma_30sma_60ema_10ema_20ema_30ema_60obvvptfinvi
date
2019-12-31 20:55293.5906260.000.08010866.45207612.8847741.00.020.080.0100.00.0657030.0041580.0615440.656730-0.1032650.7599950.240750.002978293.2490293.23425293.192000292.980833293.2490293.23425293.192000292.9808338353065.07.131644271.7492661194.559824
2019-12-31 20:56293.4457623.00-0.01022259.37918514.0098200.00.016.080.096.00.0755450.0112000.0643440.648568-0.0889360.7375040.186250.003056293.2515293.25375293.201000292.999083293.2515293.25375293.201000292.9990838345442.00.71598575.0229421194.559824
2019-12-31 20:57293.3705036.01-0.02385556.05575513.6359370.00.012.080.092.00.0764120.0096540.0667580.619311-0.0916960.7110080.153750.003134293.2525293.26625293.206667293.017167293.2525293.26625293.206667293.0171678340406.0-5.05201710.3482361194.254513
2019-12-31 20:58293.4505485.010.01022458.71039914.2895970.00.08.080.088.00.0826020.0126760.0699270.626898-0.0572420.6841400.074500.003255293.2705293.28550293.214000293.036583293.2705293.28550293.214000293.0365838345891.00.20859871.5556311194.254513
2019-12-31 20:59293.6204139.010.08862863.72519416.6672670.00.04.096.0100.00.1000720.0241160.0759560.6633790.0007750.662605-0.042750.003506293.3100293.31275293.229000293.058167293.3100293.31275293.229000293.0581678350030.03.893507161.8519691194.946362
\n", + "
" + ], + "text/plain": [ + " close volume label ... vpt fi nvi\n", + "date ... \n", + "2019-12-31 20:55 293.590 6260.0 0 ... 7.131644 271.749266 1194.559824\n", + "2019-12-31 20:56 293.445 7623.0 0 ... 0.715985 75.022942 1194.559824\n", + "2019-12-31 20:57 293.370 5036.0 1 ... -5.052017 10.348236 1194.254513\n", + "2019-12-31 20:58 293.450 5485.0 1 ... 0.208598 71.555631 1194.254513\n", + "2019-12-31 20:59 293.620 4139.0 1 ... 3.893507 161.851969 1194.946362\n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 7 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DtT-QGfdwwvn", + "colab_type": "text" + }, + "source": [ + "**Datetime cyclical encoding**\n", + "\n", + "$$x_{sin} = \\sin(\\frac{2*\\pi*x}{max(x)})$$\n", + "\n", + "$$x_{cos} = \\cos(\\frac{2*\\pi*x}{max(x)})$$" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "uRbPhrMZ07I2", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 442 + }, + "outputId": "a04862d1-4438-4388-aeb0-da8c82862677" + }, + "source": [ + "df['datetime'] = df.index.to_timestamp()\n", + "df['min_sin'] = np.sin(2 * np.pi * df.datetime.dt.minute / 60)\n", + "df['min_cos'] = np.cos(2 * np.pi * df.datetime.dt.minute / 60)\n", + "df['hour_sin'] = np.sin(2 * np.pi * df.datetime.dt.hour / 60)\n", + "df['hour_cos'] = np.cos(2 * np.pi * df.datetime.dt.hour / 60)\n", + "df['day_sin'] = np.sin(2 * np.pi * df.datetime.dt.day / 30)\n", + "df['day_cos'] = np.cos(2 * np.pi * df.datetime.dt.day / 30)\n", + "df['month_sin'] = np.sin(2 * np.pi * df.datetime.dt.month / 12)\n", + "df['month_cos'] = np.cos(2 * np.pi * df.datetime.dt.month / 12)\n", + "\n", + "df = df.drop(['datetime'], axis=1)\n", + "print(df.shape)\n", + "df.tail()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(101081, 39)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
closevolumelabelrocrsitsibb_bbhibb_bbliaroon_downaroonaroon_upmacd_linemacd_histmacd_signalkstkst_diffkst_signaldpotrixsma_10sma_20sma_30sma_60ema_10ema_20ema_30ema_60obvvptfinvimin_sinmin_coshour_sinhour_cosday_sinday_cosmonth_sinmonth_cos
date
2019-12-31 20:55293.5906260.000.08010866.45207612.8847741.00.020.080.0100.00.0657030.0041580.0615440.656730-0.1032650.7599950.240750.002978293.2490293.23425293.192000292.980833293.2490293.23425293.192000292.9808338353065.07.131644271.7492661194.559824-0.5000000.8660250.866025-0.50.2079120.978148-2.449294e-161.0
2019-12-31 20:56293.4457623.00-0.01022259.37918514.0098200.00.016.080.096.00.0755450.0112000.0643440.648568-0.0889360.7375040.186250.003056293.2515293.25375293.201000292.999083293.2515293.25375293.201000292.9990838345442.00.71598575.0229421194.559824-0.4067370.9135450.866025-0.50.2079120.978148-2.449294e-161.0
2019-12-31 20:57293.3705036.01-0.02385556.05575513.6359370.00.012.080.092.00.0764120.0096540.0667580.619311-0.0916960.7110080.153750.003134293.2525293.26625293.206667293.017167293.2525293.26625293.206667293.0171678340406.0-5.05201710.3482361194.254513-0.3090170.9510570.866025-0.50.2079120.978148-2.449294e-161.0
2019-12-31 20:58293.4505485.010.01022458.71039914.2895970.00.08.080.088.00.0826020.0126760.0699270.626898-0.0572420.6841400.074500.003255293.2705293.28550293.214000293.036583293.2705293.28550293.214000293.0365838345891.00.20859871.5556311194.254513-0.2079120.9781480.866025-0.50.2079120.978148-2.449294e-161.0
2019-12-31 20:59293.6204139.010.08862863.72519416.6672670.00.04.096.0100.00.1000720.0241160.0759560.6633790.0007750.662605-0.042750.003506293.3100293.31275293.229000293.058167293.3100293.31275293.229000293.0581678350030.03.893507161.8519691194.946362-0.1045280.9945220.866025-0.50.2079120.978148-2.449294e-161.0
\n", + "
" + ], + "text/plain": [ + " close volume label ... day_cos month_sin month_cos\n", + "date ... \n", + "2019-12-31 20:55 293.590 6260.0 0 ... 0.978148 -2.449294e-16 1.0\n", + "2019-12-31 20:56 293.445 7623.0 0 ... 0.978148 -2.449294e-16 1.0\n", + "2019-12-31 20:57 293.370 5036.0 1 ... 0.978148 -2.449294e-16 1.0\n", + "2019-12-31 20:58 293.450 5485.0 1 ... 0.978148 -2.449294e-16 1.0\n", + "2019-12-31 20:59 293.620 4139.0 1 ... 0.978148 -2.449294e-16 1.0\n", + "\n", + "[5 rows x 39 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mmkRpUsR2IAm", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 442 + }, + "outputId": "599d5571-a718-4d77-b170-29bb1fc23041" + }, + "source": [ + "df_na = df.dropna(axis=0)\n", + "print(df_na.shape)\n", + "df_na.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(101022, 39)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
closevolumelabelrocrsitsibb_bbhibb_bbliaroon_downaroonaroon_upmacd_linemacd_histmacd_signalkstkst_diffkst_signaldpotrixsma_10sma_20sma_30sma_60ema_10ema_20ema_30ema_60obvvptfinvimin_sinmin_coshour_sinhour_cosday_sinday_cosmonth_sinmonth_cos
date
2019-01-02 15:29155.7651538.010.07709955.23875129.6705000.00.04.080.084.00.176555-0.0127390.1892954.072616-0.3664354.439051-0.036000.018094155.8260155.72600155.498833155.187667155.8260155.72600155.498833155.18766712576.0-0.489252-24.6130281012.2248091.045285e-01-0.9945221.02.832769e-160.4067370.9135450.50.866025
2019-01-02 15:30155.9351765.000.15736459.65975928.4818020.00.012.068.080.00.174976-0.0114550.1864314.052878-0.3440264.396904-0.050000.017853155.8505155.74000155.536500155.206917155.8505155.74000155.536500155.20691714341.00.20031521.7674051012.2248095.665539e-16-1.0000001.02.832769e-160.4067370.9135450.50.866025
2019-01-02 15:31155.740767.010.03211553.17229625.2961600.00.08.068.076.00.156189-0.0241940.1803823.948436-0.3771544.325591-0.064000.017344155.8555155.75400155.569833155.213833155.8555155.75400155.569833155.21383313574.00.967150-2.7086531010.959000-1.045285e-01-0.9945221.02.832769e-160.4067370.9135450.50.866025
2019-01-02 15:32155.8201805.000.08349955.31892623.3802780.00.04.068.072.00.146071-0.0274490.1735203.782034-0.4537954.235828-0.074750.016742155.8550155.76475155.606667155.230000155.8550155.76475155.606667155.23000015379.0-0.03196318.3068691010.959000-2.079117e-01-0.9781481.02.832769e-160.4067370.9135450.50.866025
2019-01-02 15:33155.695740.010.00321251.35740520.5162700.00.012.056.068.00.126508-0.0376090.1641183.544114-0.5760644.1201780.059250.015916155.8560155.76575155.639333155.248250155.8560155.76575155.639333155.24825014639.00.3335532.4773161010.148001-3.090170e-01-0.9510571.02.832769e-160.4067370.9135450.50.866025
\n", + "
" + ], + "text/plain": [ + " close volume label ... day_cos month_sin month_cos\n", + "date ... \n", + "2019-01-02 15:29 155.765 1538.0 1 ... 0.913545 0.5 0.866025\n", + "2019-01-02 15:30 155.935 1765.0 0 ... 0.913545 0.5 0.866025\n", + "2019-01-02 15:31 155.740 767.0 1 ... 0.913545 0.5 0.866025\n", + "2019-01-02 15:32 155.820 1805.0 0 ... 0.913545 0.5 0.866025\n", + "2019-01-02 15:33 155.695 740.0 1 ... 0.913545 0.5 0.866025\n", + "\n", + "[5 rows x 39 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "wbd8i3CNzs_6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "labels = df_na.label\n", + "df_na = df_na.drop(['label'], axis=1)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "FV5yPf1s4_EL", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 136 + }, + "outputId": "11a849ba-67bf-4868-8617-ec89811b41a0" + }, + "source": [ + "df_na.columns" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['close', 'volume', 'roc', 'rsi', 'tsi', 'bb_bbhi', 'bb_bbli',\n", + " 'aroon_down', 'aroon', 'aroon_up', 'macd_line', 'macd_hist',\n", + " 'macd_signal', 'kst', 'kst_diff', 'kst_signal', 'dpo', 'trix', 'sma_10',\n", + " 'sma_20', 'sma_30', 'sma_60', 'ema_10', 'ema_20', 'ema_30', 'ema_60',\n", + " 'obv', 'vpt', 'fi', 'nvi', 'min_sin', 'min_cos', 'hour_sin', 'hour_cos',\n", + " 'day_sin', 'day_cos', 'month_sin', 'month_cos'],\n", + " dtype='object')" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "JXItrTrS2ub0", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "2a043371-ccdd-4e9e-94a2-ed11dcd194fc" + }, + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(df_na.values, labels.values, test_size=0.05, random_state=42)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((95970, 38), (5052, 38), (95970,), (5052,))" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "sIJSJNW75Kog", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "709bba3f-1769-415b-f42f-c2a6fe606982" + }, + "source": [ + "scaler = StandardScaler()\n", + "scaler.fit(X_train)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "StandardScaler(copy=True, with_mean=True, with_std=True)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "gSSNnMFA5dCd", + "colab_type": "code", + "colab": {} + }, + "source": [ + "X_train_scaled = scaler.transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "48fcB-RX5k_a", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "outputId": "92abcade-7862-4b8a-e320-3fa75cc3439c" + }, + "source": [ + "pca = PCA(n_components=0.8, random_state=42)\n", + "pca.fit(X_train_scaled)" + ], + "execution_count": null, + 
"outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "PCA(copy=True, iterated_power='auto', n_components=0.8, random_state=42,\n", + " svd_solver='auto', tol=0.0, whiten=False)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZQVfl8Yz6LLu", + "colab_type": "code", + "colab": {} + }, + "source": [ + "X_train_pca = pca.transform(X_train_scaled)\n", + "X_test_pca = pca.transform(X_test_scaled)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zt0c2ae46u0S", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "e46ce866-7bfa-430a-db9b-3488cc11b5ee" + }, + "source": [ + "X_train_pca.shape, X_test_pca.shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((95970, 10), (5052, 10))" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "38ZISGqG6yWD", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "676c96f7-bff5-47fc-eb09-a4ae1118ca97" + }, + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train_pca, y_train)\n", + "y_pred = model.predict_proba(X_test_pca)\n", + "acc = accuracy_score(y_test, y_pred.argmax(axis=1))\n", + "f1 = f1_score(y_test, y_pred.argmax(axis=1))\n", + "roc = roc_auc_score(y_test, y_pred[:, 1])\n", + "print(f'LogisticRegression: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "LogisticRegression: acc=56.57% - f1=71.40% - roc=54.00%\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "MQ5ZHLbg_Kbo", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "567ea292-8ee8-42e8-930d-3baa60067ded" + }, + "source": [ + "model = DecisionTreeClassifier()\n", + "model.fit(X_train_pca, y_train)\n", + "y_pred = model.predict_proba(X_test_pca)\n", + "acc = accuracy_score(y_test, y_pred.argmax(axis=1))\n", + "f1 = f1_score(y_test, y_pred.argmax(axis=1))\n", + "roc = roc_auc_score(y_test, y_pred[:, 1])\n", + "print(f'LogisticRegression: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "LogisticRegression: acc=53.42% - f1=59.31% - roc=52.43%\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "NzTQbksY_8_U", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "83b89cdf-6a4f-46ac-d717-e40ddb33b29e" + }, + "source": [ + "model = RandomForestClassifier()\n", + "model.fit(X_train_pca, y_train)\n", + "y_pred = model.predict_proba(X_test_pca)\n", + "acc = accuracy_score(y_test, y_pred.argmax(axis=1))\n", + "f1 = f1_score(y_test, y_pred.argmax(axis=1))\n", + "roc = roc_auc_score(y_test, y_pred[:, 1])\n", + "print(f'RandomForestClassifier: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "RandomForest: acc=55.19% - f1=62.85% - roc=56.71%\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5r7tdrYUARXv", + 
"colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "3646033b-5891-4236-e5fd-23685f26bf79" + }, + "source": [ + "model = GradientBoostingClassifier()\n", + "model.fit(X_train_pca, y_train)\n", + "y_pred = model.predict_proba(X_test_pca)\n", + "acc = accuracy_score(y_test, y_pred.argmax(axis=1))\n", + "f1 = f1_score(y_test, y_pred.argmax(axis=1))\n", + "roc = roc_auc_score(y_test, y_pred[:, 1])\n", + "print(f'GradientBoostingClassifier: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "GradientBoostingClassifier: acc=56.77% - f1=71.75% - roc=56.97%\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "aO7TrJ4XCFhM", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "da995af5-5b01-45e3-faa5-a846a8890c95" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/bin/bash: conda: command not found\n" + ], + "name": "stdout" + } + ] + } + ] +} \ No newline at end of file diff --git a/Beta-L/Web_scrapping/search_queries.txt b/Beta-L/Web_scrapping/search_queries.txt new file mode 100644 index 0000000..4eb682a --- /dev/null +++ b/Beta-L/Web_scrapping/search_queries.txt @@ -0,0 +1,45 @@ +SEBI regulations explained +SEBI guidelines for investors +SEBI compliance tips +SEBI rules for stock market +SEBI regulations simplified +best mutual funds to invest in +mutual fund investment tips +mutual funds explained +SEBI guidelines for mutual funds +mutual funds expert advice +stock market analysis tutorial +technical analysis of stocks +fundamental analysis of stocks +SEBI regulations for stock analysis +stock market tips from experts +personal finance planning tips +investment planning strategies +financial goals setting +SEBI guidelines for financial planning +financial planning for beginners +best trading strategies explained +intraday trading tips and tricks +swing trading techniques +SEBI regulations for trading +successful trading strategies +investment trading influencer +finance and trading influencer +investment expert +stock market guru +crypto trading influencer +forex trading expert +options trading influencer +day trading guru +technical analysis expert +long-term investing influencer +value investing guru +dividend investing expert +swing trading influencer +market analysis influencer +economic trends expert +macroeconomic analysis guru +financial literacy influencer +investing basics expert +trading tutorials guru + diff --git a/Beta-L/Web_scrapping/web_scrapping b/Beta-L/Web_scrapping/web_scrapping new file mode 100644 index 0000000..6a9d4c1 --- /dev/null +++ b/Beta-L/Web_scrapping/web_scrapping @@ -0,0 +1,124 @@ +import mysql.connector +from googleapiclient.discovery import build +from pytube import YouTube + +# MySQL database connection parameters +import ssl + +# Disable SSL certificate verification +ssl._create_default_https_context = ssl._create_unverified_context + + +db_params = { + "host": "sebi-hackathon.mysql.database.azure.com", + "user": "mysql", + "password": "Betateam-L", + "database": "youtube_data" +} + +# YouTube Data API key +api_key = "AIzaSyD8Eo2iz2butc0qZN6EHRdHlRAy__9FJ6Y" + +# Create a MySQL database connection +db_connection = mysql.connector.connect(**db_params) +db_connection.set_charset_collation('utf8mb4', 'utf8mb4_unicode_ci') +db_cursor = 
db_connection.cursor()
+
+def video_exists(video_url):
+    # The table is keyed on the full video URL, so look it up by URL.
+    select_query = "SELECT COUNT(*) FROM youtube_videos WHERE video_url = %s"
+    db_cursor.execute(select_query, (video_url,))
+    count = db_cursor.fetchone()[0]
+    return count > 0
+
+
+def extract_and_store_youtube_data(video_url):
+    try:
+        # Create a YouTube object
+        yt = YouTube(video_url)
+        video_id = yt.video_id
+
+        # Deduplicate by URL, since video_exists() matches on the video_url column.
+        if not video_exists(video_url):
+            # Extract video details
+            video_title = yt.title
+            video_description = yt.description
+
+            # Fetch video metrics using the YouTube Data API
+            youtube = build("youtube", "v3", developerKey=api_key)
+            response = youtube.videos().list(part="statistics,snippet", id=video_id).execute()
+            video_stats = response["items"][0]["statistics"]
+            video_likes = video_stats.get("likeCount", 0)
+            # dislikeCount has not been returned publicly since late 2021,
+            # so this usually falls back to 0.
+            video_dislikes = video_stats.get("dislikeCount", 0)
+            video_comment_count = video_stats.get("commentCount", 0)
+
+            # Fetch top-level video comments using the YouTube Data API
+            comments_response = youtube.commentThreads().list(part="snippet", videoId=video_id).execute()
+            video_comments = [comment["snippet"]["topLevelComment"]["snippet"]["textDisplay"] for comment in comments_response.get("items", [])]
+
+            # Resolve the YouTuber's channel name from the channel id
+            channel_name_response = youtube.channels().list(part="snippet", id=yt.channel_id).execute()
+            channel_name = channel_name_response["items"][0]["snippet"]["title"]
+
+            # Store data in the MySQL database
+            insert_query = """
+                INSERT INTO youtube_videos (title, description, likes, dislikes, comment_count, comments, video_url, channel_name, video_id)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
+            """
+            values = (video_title, video_description, video_likes, video_dislikes, video_comment_count, "\n".join(video_comments), video_url, channel_name, video_id)
+
+            db_cursor.execute(insert_query, values)
+            db_connection.commit()
+
+            print("Data inserted successfully!")
+        else:
+            print("Video already exists in the database.")
+
+    except Exception as e:
+        print("An error occurred:", str(e))
+
+
+def fetch_search_queries_from_file(file_path):
+    with open(file_path, "r") as file:
+        search_queries = file.read().splitlines()
+    return search_queries
+
+
+def fetch_and_store_fintech_influencer_data():
+    # Fetch fintech influencer video URLs using the YouTube Data API
+    youtube = build("youtube", "v3", developerKey=api_key)
+    search_queries = fetch_search_queries_from_file("search_queries.txt")
+
+    for query in search_queries:
+        next_page_token = None
+        while True:
+            search_response = youtube.search().list(
+                q=query,
+                part="id",
+                type="video",  # restrict results to videos so every item carries a videoId
+                maxResults=50,  # maximum number of results per request
+                pageToken=next_page_token
+            ).execute()
+
+            for item in search_response["items"]:
+                try:
+                    video_id = item["id"]["videoId"]
+                    video_url = f"https://www.youtube.com/watch?v={video_id}"
+                    extract_and_store_youtube_data(video_url)
+                except KeyError as e:
+                    print("Error extracting videoId:", e)
+                    print("Item dictionary:", item)
+
+            # Check if there are more pages of results
+            next_page_token = search_response.get("nextPageToken")
+            if not next_page_token:
+                break  # no more results to fetch
+
+
+def main():
+    fetch_and_store_fintech_influencer_data()
+
+# Run the main function
+if __name__ == "__main__":
+    main()
+
+# Close the database connection
+db_cursor.close()
+db_connection.close()
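+
+
+# --- Reviewer sketch (not part of the original script) -----------------------
+# The paginated search loop above can exhaust the Data API daily quota quickly,
+# and any quota or transient HTTP failure currently surfaces as an unhandled
+# exception. A minimal retry sketch; `execute_with_backoff` is a hypothetical
+# helper name and would need to be wired into the .execute() call sites above:
+import time
+from googleapiclient.errors import HttpError
+
+def execute_with_backoff(request, max_retries=5):
+    """Execute a googleapiclient HttpRequest, retrying quota/transient errors."""
+    for attempt in range(max_retries):
+        try:
+            return request.execute()
+        except HttpError as err:
+            # 403/429 cover quota and rate limiting; 500/503 are transient.
+            if err.resp.status in (403, 429, 500, 503) and attempt < max_retries - 1:
+                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
+            else:
+                raise
+
+# Example usage (replacing a direct .execute() above):
+#   search_response = execute_with_backoff(
+#       youtube.search().list(q=query, part="id", type="video", maxResults=50,
+#                             pageToken=next_page_token))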