diff --git a/2022_AMLI_capstone_template.pptx b/2022_AMLI_capstone_template.pptx
new file mode 100644
index 0000000..2d22769
Binary files /dev/null and b/2022_AMLI_capstone_template.pptx differ
diff --git a/Capstone Project Group 4 Decision Log.pdf b/Capstone Project Group 4 Decision Log.pdf
new file mode 100644
index 0000000..f14cdef
Binary files /dev/null and b/Capstone Project Group 4 Decision Log.pdf differ
diff --git a/CapstoneProjectGitHubFinish.ipynb b/CapstoneProjectGitHubFinish.ipynb
new file mode 100644
index 0000000..d837e2a
--- /dev/null
+++ b/CapstoneProjectGitHubFinish.ipynb
@@ -0,0 +1,186 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "CapstoneProjectGitHubFinish.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        " #PART 1: Display the Top 100 Words in a Subset of the 20 Newsgroups Dataset\n",
+        "\n",
+        "#Step 1: Import all the functions needed\n",
+        "from operator import index\n",
+        "import pandas as pd\n",
+        "from sklearn import datasets\n",
+        "from sklearn.datasets import fetch_20newsgroups\n",
+        "import numpy as np\n",
+        "from pprint import pprint\n",
+        "from sklearn.feature_extraction.text import CountVectorizer\n",
+        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+        "import ssl\n",
+        "\n",
+        "ssl._create_default_https_context = ssl._create_unverified_context\n",
+        "\n",
+        "#Step 2: Import the dataset with the 20 Newsgroups\n",
+        "from sklearn.datasets import fetch_20newsgroups\n",
+        "\n",
+        "\n",
+        "newsgroups_train = fetch_20newsgroups()\n",
+        "print(newsgroups_train.keys())\n",
+        "\n",
+        "\n",
+        "FEATURES = newsgroups_train['filenames']\n",
+        "TARGET = 'titles'\n",
+        "\n",
+        "FEATURES, TARGET\n",
+        "\n",
+        "#Step 3: Take the imported newsgroups and transform the data into a DataFrame.\n",
+        "print(newsgroups_train.keys())\n",
+        "FEATURES = newsgroups_train['filenames']\n",
+        "TARGET = 'titles'\n",
+        "FEATURES, TARGET\n",
+        "print(len(newsgroups_train['data']))\n",
+        "df = pd.DataFrame(newsgroups_train['data'])\n",
+        "df\n",
+        "df[TARGET] = newsgroups_train['target']\n",
+        "df\n",
+        "\n",
+        "#Step 4: Perform Exploratory Data Analysis on the data\n",
+        "df.head()\n",
+        "df.isnull().sum()\n",
+        "\n",
+        "#Step 5: Isolate the three needed subsets of the 20 Newsgroups and collect their document indices.\n",
+        "target_name_index = [newsgroups_train.target_names.index('talk.religion.misc'), newsgroups_train.target_names.index('sci.electronics'), newsgroups_train.target_names.index('rec.motorcycles')]\n",
+        "word_index = np.where(newsgroups_train.target == target_name_index[0])[0]\n",
+        "speak_index = np.where(newsgroups_train.target == target_name_index[1])[0]\n",
+        "reference_index = np.where(newsgroups_train.target == target_name_index[2])[0]\n",
+        "target_index = np.append(np.append(word_index, speak_index), reference_index)\n",
+        "target_documents = [newsgroups_train.data[x] for x in target_index]\n",
+        "\n",
+        "#Step 6: Display the top 100 words\n",
+        "cv = CountVectorizer(stop_words='english', max_features=100)\n",
+        "words_bag = cv.fit_transform(target_documents)\n",
+        "sum_of_words = words_bag.sum(axis=0)\n",
+        "word_frequency = [(word, sum_of_words[0, idx]) for word, idx in cv.vocabulary_.items()]\n",
+        "word_frequency = sorted(word_frequency, key=lambda x: x[1], reverse=True)\n",
+        "print('Top 100 words across documents in electronics, religion, and motorcycles')\n",
+        "print('------------------------------------------------------------------------')\n",
+        "for word, count in word_frequency:\n",
+        "    print(word + ':', count)\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n"
+      ],
+      "metadata": {
+        "id": "2ZsDoqZy4viG"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from inspect import EndOfBlock\n",
+        " #PART 2: Cluster Documents (Unsupervised Learning) And Discover Topics\n",
+        "#Step 1: Import all the functions needed\n",
+        "from operator import index\n",
+        "import pandas as pd\n",
+        "from sklearn import datasets\n",
+        "from sklearn.datasets import fetch_20newsgroups\n",
+        "import numpy as np\n",
+        "from pprint import pprint\n",
+        "from sklearn.feature_extraction.text import CountVectorizer\n",
+        "from sklearn.decomposition import NMF\n",
+        "import matplotlib.pyplot as plt\n",
+        "from nltk.stem import WordNetLemmatizer\n",
+        "from sklearn.cluster import KMeans\n",
+        "import nltk\n",
+        "#Step 2: Download the WordNet data and define a test for whether a word counts as a token\n",
+        "nltk.download('wordnet')\n",
+        "nltk.download('omw-1.4')\n",
+        "\n",
+        "\n",
+        "def letters_only(astr):\n",
+        "    return astr.isalpha()\n",
+        "\n",
+        "\n",
+        "\n",
+        "#Step 3: Strip these words out of the corpus for the given topics and apply lemmatization.\n",
+        "stripped_words = ['organization', 'article', 'mr', 'know', 'like', 'com', 'edu', 'lines', 'subject', 'university', 'say', 'think']\n",
+        "clean = []\n",
+        "lemmatizer = WordNetLemmatizer()\n",
+        "print('Cleaning the list')\n",
+        "for post in target_documents:\n",
+        "    clean.append(\" \".join(lemmatizer.lemmatize(word.lower()) for word in post.split() if letters_only(word) and word.lower() not in stripped_words))\n",
+        "cleaned_words_bag = cv.fit_transform(clean)\n",
+        "print(cv.get_feature_names())\n",
+        "\n",
+        "\n",
+        "#Step 4: Find the optimal K\n",
+        "print('Finding the optimal K')\n",
+        "Sum_of_squared_distance = []\n",
+        "K = range(1, 16)\n",
+        "for k in K:\n",
+        "    km = KMeans(n_clusters=k)\n",
+        "    km = km.fit(cleaned_words_bag)\n",
+        "    Sum_of_squared_distance.append(km.inertia_)\n",
+        "\n",
+        "#Step 5: Plot the elbow curve for the K clusters\n",
+        "plt.plot(K, Sum_of_squared_distance)\n",
+        "plt.xlabel('k')\n",
+        "plt.ylabel('Sum of squared distances')\n",
+        "plt.title('Elbow Method for the optimal K')\n",
+        "plt.show()\n",
+        "\n"
+      ],
+      "metadata": {
+        "id": "HUIvo6Y-Bccy"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#Step 6: Perform Topic Modeling (must be run separately from the K-Means cell above)\n",
+        "K = 7\n",
+        "km = KMeans(n_clusters=K)\n",
+        "km = km.fit(cleaned_words_bag)\n",
+        "print('iter', km.n_iter_) #<----- Printing the number of iterations\n",
+        "print('features', km.n_features_in_) #<-------- Printing the number of features seen during the fit\n",
+        "\n",
+        "\n",
+        "for t in range(K):\n",
+        "    group_indices = np.where(km.labels_ == t)\n",
+        "    group_docs = [clean[x] for x in group_indices[0]]\n",
+        "    if len(group_indices[0]) > 2:\n",
+        "        fits = cv.fit_transform(group_docs)\n",
+        "\n",
+        "        print('Group ' + str(t + 1) + ':')\n",
+        "        nmf = NMF(n_components=3, random_state=50).fit(fits)\n",
+        "        for topic_idx, topic in enumerate(nmf.components_):\n",
+        "            print(topic_idx, ':', ' '.join([cv.get_feature_names()[x] for x in topic.argsort()[:-9:-1]]))\n",
+        "        continue\n"
+      ],
+      "metadata": {
+        "id": "ZRLCXT8vVeFr"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/Ethiclal consideration (1).docx b/Ethiclal consideration (1).docx
new file mode 100644
index 0000000..03a8df4
Binary files /dev/null and b/Ethiclal consideration (1).docx differ
diff --git a/Group 4 Capstone Project Design Document.pdf b/Group 4 Capstone Project Design Document.pdf
new file mode 100644
index 0000000..1384a49
Binary files /dev/null and b/Group 4 Capstone Project Design Document.pdf differ
diff --git a/README.md b/README.md
index 27ae2eb..6db4854 100644
--- a/README.md
+++ b/README.md
@@ -2,29 +2,76 @@
-# final-project
-## [National Action Council for Minorities in Engineering(NACME)](https://www.nacme.org) Google Applied Machine Learning Intensive (AMLI) at the `PARTICIPATING_UNIVERSITY`
+# Displaying the Top 100 Words and Clustering Data
+
+### NACME/AMLI Google Bootcamp
+
+**NACME (National Action Council for Minorities in Engineering)** is an organization committed to assisting underrepresented minorities in engineering and computer science career paths. NACME provides scholarships, opportunities, and programs to grow the engineering workforce and prepare underrepresented students for the real world. The AMLI Google Bootcamp is one of the programs NACME provides; it gives college students an introduction to machine learning so they can compete for entry-level machine learning positions.
 Developed by:
-- [member1](https://github.com/cbaker6) - `STUDENTS_UNIVERSITY`
-- [member2](https://github.com/cbaker6) - `STUDENTS_UNIVERSITY`
-- [member3](https://github.com/cbaker6) - `STUDENTS_UNIVERSITY`
-- [member4](https://github.com/cbaker6) - `STUDENTS_UNIVERSITY`
+- [Brianna Murel](https://github.com/brmu22) - Morgan State University
+- [Jaden Robinson](https://github.com/Jaytheegreat) - Morgan State University
+- [Tobi Owolabi](https://github.com/TobiOwolabi) - Morgan State University
+- [Laila Amin](https://github.com/nightchild993 "Laila Amin") - Morgan State University
+
+## Roles
+
+
+**Everyone**
+- **Colab File Contributors**: Create the Colab notebook and apply techniques such as EDA (Exploratory Data Analysis), building a DataFrame, and training a model. The group is responsible for implementing and documenting the code needed to complete the assigned topic for the project.
+
+**Brianna**
+- **PowerPoint Creator**: Manage the presentation slides detailing the team members and their roles, the reasons for completing the project, the lessons learned along the way, and the steps moving forward after completion.
+
+**Tobi**
+- **Project Lead**: Keep the group on task and make sure everyone works on their assigned parts of the project. Maintain communication with the team and write a project report as necessary.
+
+- **Design Documenter**: Keep the design document updated with the steps taken to complete the project, and record the project's intermediate and long-term goals.
+
+**Jaden**
+- **README.md Creator**: Create the introductory file that explains what the NACME/AMLI Google Bootcamp is, describes the project topic, and lists the team members with links to their GitHub accounts and their university. The README also summarizes the project results.
+
+**Laila**
+- **Ethical Considerations Editor**: Write a discussion of the project's ethics, describe how a fictional character could be positively and negatively affected by the project model, list possible biases, and describe modifications to mitigate them.
 ## Description
+The premise of the project is to use the 20 Newsgroups dataset and display the top 100 words by their frequencies (how often they appear). Natural language processing is then applied to cluster the documents and investigate the top 3 sets of topics contained in each cluster. Our goal is to conduct unsupervised learning by clustering our documents and examining the hidden words within our data. The words in the documents are cleaned by performing lemmatization (a Natural Language Processing step that accurately groups multiple variations of the same word). K-means clustering is then used to group the documents into a number of clusters and display the top 3 sets of topics for each cluster (a.k.a. Topic Modeling). Minimal code sketches for each of these steps are collected at the end of this document.
+
+## Natural Language Processing
+- NLP is the field in which machines learn to interpret human language much as humans do. Communicating with the iPhone's Siri is one example of NLP in use: when a user speaks into their device, Siri converts the speech into a machine-readable representation and outputs the information the user was looking for.
+![](https://www.cybiant.com/wp-content/uploads/2020/01/CKC-Natural-Language-Processing.png)
+
+## Lemmatization
+- Lemmatization is the process of converting words into their root word, or lemma. In the example below, "play" is the lemma for all the variations of the word. Lemmatization is used in this project to increase the accuracy of the top topics printed for each cluster.
+![](https://i.pinimg.com/736x/bf/81/be/bf81be81beba989425ea49e8856c952a.jpg)
+
+## Topic Modeling
+- Topic modeling is an unsupervised technique for discovering the abstract topics that occur in a collection of documents. In this project, NMF (Non-negative Matrix Factorization) is applied to each K-means cluster to surface its top 3 sets of topics.
+![](https://miro.medium.com/max/1400/1*cDwKSHmfp5awjqjobV707g.png)
+
+## References
+- https://www.pinterest.com/pin/stemming-and-lemmatization-in-python--540713499008866837/
+- https://www.cybiant.com/resources/natural-language-processing/
+- https://medium.com/analytics-vidhya/how-to-perform-topic-modeling-using-mallet-abc43916560f
 ## Usage instructions
+
 1. Fork this repo
 2. Change directories into your project
-3. On the command line, type `pip3 install requirements.txt`
-4. ....
+3. On the command line, type `pip3 install -r requirements.txt`
+4. Open `CapstoneProjectGitHubFinish.ipynb` (for example, in Google Colab) and run the cells in order.
+
+## Contacts
+- Tobi: toowo1@morgan.edu
+- Jaden: jarob66@morgan.edu
+- Brianna: brmur12@morgan.edu
+- Laila: laami2@morgan.edu
+
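## Code sketches

The sketches below restate the main steps from `CapstoneProjectGitHubFinish.ipynb` and the Description above in minimal, self-contained form. They are illustrative rather than exact: variable names are simplified, `random_state` values are added only for reproducibility, and `fetch_20newsgroups(categories=...)` is assumed to select the same three newsgroups that the notebook isolates by filtering indices by hand.

First, counting the 100 most frequent words across `talk.religion.misc`, `sci.electronics`, and `rec.motorcycles` with `CountVectorizer`:

```python
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# Download only the three newsgroups used in the project.
categories = ['talk.religion.misc', 'sci.electronics', 'rec.motorcycles']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)

# Bag of words limited to the 100 most frequent terms, English stop words removed.
cv = CountVectorizer(stop_words='english', max_features=100)
bag = cv.fit_transform(newsgroups.data)

# Total count per term, sorted from most to least frequent.
totals = bag.sum(axis=0)
frequencies = sorted(((word, totals[0, idx]) for word, idx in cv.vocabulary_.items()),
                     key=lambda pair: pair[1], reverse=True)

print('Top 100 words across documents in electronics, religion, and motorcycles')
for word, count in frequencies:
    print(f'{word}: {count}')
```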
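Next, a small demonstration of the lemmatization idea, using NLTK's `WordNetLemmatizer` as the notebook does. The notebook lemmatizes with the default (noun) part of speech; the `pos='v'` calls below are only there to show how the verb forms from the figure collapse to the lemma "play". The sample sentence and the short `stripped_words` set are made up for illustration.

```python
import nltk
from nltk.stem import WordNetLemmatizer

# WordNet data must be downloaded before lemmatizing.
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize('plays'))             # play (noun lemmatization, the default)
print(lemmatizer.lemmatize('playing', pos='v'))  # play
print(lemmatizer.lemmatize('played', pos='v'))   # play

# Cleaning a post the way the notebook does: keep alphabetic tokens,
# lower-case them, drop unwanted words, and lemmatize what is left.
stripped_words = {'organization', 'article', 'subject'}
post = 'The Organization article says the players played well'
clean = ' '.join(lemmatizer.lemmatize(word.lower())
                 for word in post.split()
                 if word.isalpha() and word.lower() not in stripped_words)
print(clean)
```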
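A sketch of the elbow method used to pick K: fit K-Means for K from 1 to 15 on the bag-of-words matrix and plot the inertia (sum of squared distances to the nearest centroid). The `random_state=0` is an assumption added here for reproducibility; the notebook leaves it unset.

```python
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

categories = ['talk.religion.misc', 'sci.electronics', 'rec.motorcycles']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
bag = CountVectorizer(stop_words='english', max_features=100).fit_transform(newsgroups.data)

k_values = range(1, 16)
inertias = []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=0).fit(bag)
    inertias.append(km.inertia_)  # sum of squared distances for this K

plt.plot(k_values, inertias)
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method for the optimal K')
plt.show()
```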
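Finally, a sketch of the per-cluster topic modeling: cluster the documents with K-Means (K = 7, the value chosen in the notebook), then factor each sufficiently large cluster into 3 topics with NMF and print the top words per topic. Unlike the notebook, this version re-fits a fresh `CountVectorizer` per cluster and uses `get_feature_names_out()` (the current scikit-learn name for `get_feature_names()`); treat it as an outline of the approach rather than a drop-in replacement.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer

categories = ['talk.religion.misc', 'sci.electronics', 'rec.motorcycles']
docs = fetch_20newsgroups(subset='train', categories=categories).data

bag = CountVectorizer(stop_words='english', max_features=100).fit_transform(docs)

K = 7  # cluster count read off the elbow plot
km = KMeans(n_clusters=K, random_state=0).fit(bag)

for t in range(K):
    cluster_docs = [docs[i] for i in np.where(km.labels_ == t)[0]]
    if len(cluster_docs) <= 2:
        continue  # too few documents to factor into topics
    # Re-vectorize this cluster and factor its counts into 3 topics.
    cluster_cv = CountVectorizer(stop_words='english', max_features=100)
    counts = cluster_cv.fit_transform(cluster_docs)
    nmf = NMF(n_components=3, random_state=50).fit(counts)
    vocab = cluster_cv.get_feature_names_out()
    print(f'Group {t + 1}:')
    for topic_idx, topic in enumerate(nmf.components_):
        top_words = [vocab[i] for i in topic.argsort()[:-9:-1]]
        print(' ', topic_idx, ':', ' '.join(top_words))
```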