Binary file added 2022_AMLI_capstone_template.pptx
Binary file not shown.
Binary file added Capstone Project Group 4 Decision Log.pdf
Binary file not shown.
186 changes: 186 additions & 0 deletions CapstoneProjectGitHubFinish.ipynb
@@ -0,0 +1,186 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "CapstoneProjectGitHubFinish.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
" #PART 1: Display 100 Top Words in Subset of 20 News Groups Data set\n",
"\n",
"#Step 1 Import all the functions needed\n",
"from operator import index\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"import numpy as np\n",
"from pprint import pprint\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import ssl\n",
"\n",
"ssl._create_default_https_context = ssl._create_unverified_context\n",
"\n",
"#Step 2: Import the dataset witht the 20 NewsGroups\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
"\n",
"newsgroups_train = fetch_20newsgroups()\n",
"print(newsgroups_train.keys())\n",
"\n",
"\n",
"FEATURES = newsgroups_train['filenames']\n",
"TARGET = 'titles'\n",
"\n",
"FEATURES, TARGET\n",
"\n",
"#Step 3: Take the Imported Newsgroups and transform the data into a dataframe.\n",
"print(newsgroups_train.keys())\n",
"FEATURES = newsgroups_train['filenames']\n",
"TARGET = 'titles'\n",
"FEATURES, TARGET\n",
"print(len(newsgroups_train['data']))\n",
"df = pd.DataFrame(newsgroups_train['data'])\n",
"df\n",
"df[TARGET] = newsgroups_train['target']\n",
"df\n",
"\n",
"#Step 4: Perform Exploratory Data Analyis on the data\n",
"df.head()\n",
"df.isnull().sum()\n",
"\n",
"#Step 5: Isolate the three needed subsets of the 20 NewsGroups, list them, & target the NewsGroups.\n",
"target_name_index = [newsgroups_train.target_names.index('talk.religion.misc'),newsgroups_train.target_names.index('sci.electronics'), newsgroups_train.target_names.index('rec.motorcycles') ]\n",
"word_index = np.where(newsgroups_train.target == target_name_index[0])[0]\n",
"speak_index = np.where(newsgroups_train.target == target_name_index[1])[0]\n",
"refrence_index = np.where(newsgroups_train.target == target_name_index [2])[0]\n",
"target_index = np.append(np.append(word_index, speak_index), refrence_index)\n",
"target_documents = [newsgroups_train.data[x] for x in target_index]\n",
"\n",
"#Step 6: Display the top 100 Words\n",
"cv = CountVectorizer(stop_words='english', max_features=100)\n",
"words_bag = cv.fit_transform(target_documents)\n",
"sum_of_words = words_bag.sum(axis=0)\n",
"word_frequency = [(word, sum_of_words[0, idx]) for word, idx in cv.vocabulary_.items()]\n",
"word_frequency = sorted(word_frequency, key = lambda x: x[1], reverse=True)\n",
"print('Top 100 words across documents in electronics, religion, and motorcycles')\n",
"print('------------------------------------------------------------------------')\n",
"for word, count in word_frequency:\n",
" print(word + ':', count)\n",
"\n",
"\n",
"\n",
"\n"
],
"metadata": {
"id": "2ZsDoqZy4viG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from inspect import EndOfBlock\n",
" #PART 2: Cluster Documents (Unsupervised Learning) And Discover Topics\n",
"#Step 1 Import all the functions needed\n",
"from operator import index\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"import numpy as np\n",
"from pprint import pprint\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.decomposition import NMF\n",
"import matplotlib.pyplot as plt\n",
"from nltk.stem import WordNetLemmatizer\n",
"from sklearn.cluster import KMeans\n",
"import nltk\n",
"#Step 2: Test if a word would count as a token\n",
"nltk.download('wordnet')\n",
"nltk.download('omw-1.4')\n",
"\n",
"\n",
"def letters_only(astr):\n",
" return astr.isalpha()\n",
"\n",
"\n",
"\n",
"#Step 3: Strip these words out of the corpus for the given topics and apply lemmatization. \n",
"stripped_words = ['organization', 'article', 'mr', 'know', 'like', 'com', 'edu', 'lines', 'subject', 'university', 'say', 'think']\n",
"clean = []\n",
"lemmatizer = WordNetLemmatizer()\n",
"print('Cleaning the list')\n",
"for post in target_documents:\n",
" clean.append(\" \". join(lemmatizer.lemmatize(word.lower()) for word in post.split() if letters_only(word) and word.lower() not in stripped_words))\n",
"cleaned_words_bag = cv.fit_transform(clean)\n",
"print(cv.get_feature_names)\n",
"\n",
"\n",
"#Step #4: Find the optimal K\n",
"print('Finding the optimal K')\n",
"Sum_of_squared_distance = []\n",
"K = range(1,16)\n",
"for k in K:\n",
" km = KMeans(n_clusters = k)\n",
" km = km.fit(cleaned_words_bag) \n",
" km = Sum_of_squared_distance.append(km.inertia_)\n",
"\n",
"#Step 5: Plotting the K Cluster\n",
"plt.plot(K, Sum_of_squared_distance)\n",
"plt.xlabel('k')\n",
"plt.ylabel(Sum_of_squared_distance)\n",
"plt.title('Elbow Method for the optimal K')\n",
"plt.show()\n",
"\n"
],
"metadata": {
"id": "HUIvo6Y-Bccy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#Step 6: Perform Topic Modeling (Must be ran seperatly from the K-Means Cluster)\n",
"K = 7\n",
"km = KMeans(n_clusters = K)\n",
"km = km.fit(cleaned_words_bag)\n",
"print('iter',km.n_iter_) #<----- Prining the number of iterations\n",
"print('features',km.n_features_in_) #<-------- Prining the number of features seen duirng the fit\n",
"\n",
"\n",
"for t in range (K-1):\n",
" group_indices = np.where(km.labels_ == t ) \n",
" group_docs = [clean[x] for x in group_indices[0]]\n",
" if len(group_indices[0]) > 2:\n",
" fits = cv.fit_transform(group_docs)\n",
"\n",
" print('Group' + str((t + 1))+':')\n",
" nmf = NMF(n_components=3, random_state=50).fit(fits)\n",
" for topic_idx, topic in enumerate(nmf.components_):\n",
" print(topic_idx, ':', ' '.join([cv.get_feature_names()[x] for x in topic.argsort()[:-9:-1]]))\n",
" continue\n"
],
"metadata": {
"id": "ZRLCXT8vVeFr"
},
"execution_count": null,
"outputs": []
}
]
}
Binary file added Ethiclal consideration (1).docx
Binary file not shown.
Binary file added Group 4 Capstone Project Design Document.pdf
Binary file not shown.
65 changes: 56 additions & 9 deletions README.md
Expand Up @@ -2,29 +2,76 @@
<!--
Name of your teams' final project
-->
# final-project
## [National Action Council for Minorities in Engineering(NACME)](https://www.nacme.org) Google Applied Machine Learning Intensive (AMLI) at the `PARTICIPATING_UNIVERSITY`
# Displaying the Top 100 Words and Clustering Data


### NACME/AMLI Google Bootcamp

**NACME** **(National Action Council For Minorities in Engineering)** is an organization committed to assisting underrepresented minorities in engineering and computer science career paths. NACME provides scholarships, opportunities, and programs in order to grow the engineering workforce and prepare underrepresented students for the real world. The AMLI Google Bootcamp is one of the programs NACME provides; it gives college students an introduction to machine learning so they can contend for an entry-level machine learning position.
<!--
List all of the members who developed the project and
link to each members respective GitHub profile

-->
Developed by:
- [member1](https://github.com/cbaker6) - `STUDENTS_UNIVERSITY`
- [member2](https://github.com/cbaker6) - `STUDENTS_UNIVERSITY`
- [member3](https://github.com/cbaker6) - `STUDENTS_UNIVERSITY`
- [member4](https://github.com/cbaker6) - `STUDENTS_UNIVERSITY`
- [Brianna Murel](https://github.com/brmu22) - Morgan State University
- [Jaden Robinson](https://github.com/Jaytheegreat) - Morgan State University
- [Tobi Owolabi](https://github.com/TobiOwolabi) - Morgan State University
- [Laila Amin](https://github.com/nightchild993 "Laila Amin") - Morgan State University

## Roles


**Everyone**
- **Colab File Contributors**: Create an empty Colab notebook and apply techniques such as EDA (Exploratory Data Analysis), creating a DataFrame, and training a model. The group is responsible for implementing and documenting the code needed to complete the project's assigned topic.

**Brianna**
- **PowerPoint Creator**: Manages the presentation slides detailing the team members and their roles, the reasons for completing the project, the lessons learned along the way, and the steps moving forward after completion.

**Tobi**
- **Project Lead**: Keeps the group on task and ensures that everyone works on their assigned parts of the project. Maintains communication with the team and writes a project report as necessary.

- **Design Documenter**: Frequently updates the design document with the steps taken to complete the project and writes out its intermediate and long-term goals.

**Jaden**
- **README.md file Creator**: Creates the introductory file that explains what the NACME/AMLI Google Bootcamp is, gives a description of the project topic, lists the team members with links to their GitHub accounts and their associated university, and presents the results of the project.

**Laila**
- **Ethical Considerations Editor**: Writes a discussion of the project's ethics, including paragraphs on how a fictional character could be positively and negatively affected by the project's model, a list of possible biases, and modifications that could mitigate those biases.

## Description
<!--
Give a short description on what your project accomplishes and what tools is uses. In addition, you can drop screenshots directly into your README file to add them to your README. Take these from your presentations.
-->
The premise of the project is to take the 20 Newsgroups dataset and display the top 100 words by their frequencies (how often they appear). Natural language processing is then applied to cluster the documents and investigate the top 3 sets of topics contained in each cluster. Our goal is to conduct unsupervised learning by clustering our documents and examining the hidden words within our data. The words in the documents are cleaned through lemmatization (a Natural Language Processing step that accurately groups multiple variations of the same word), and K-means clustering groups the documents into a number of clusters before the top 3 sets of topics per cluster are displayed (a.k.a. topic modeling). A minimal sketch of the first step is shown below.
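
Here is a minimal sketch of the first part of the pipeline: fetching a subset of the 20 Newsgroups data and counting the most frequent words. It assumes scikit-learn is installed; the three categories and the `CountVectorizer` settings mirror the ones in the notebook, while the variable names are just illustrative.

```python
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# Fetch only the three newsgroups used in the project.
categories = ['talk.religion.misc', 'sci.electronics', 'rec.motorcycles']
newsgroups = fetch_20newsgroups(categories=categories)

# Count words across all documents, keeping the 100 most frequent terms.
cv = CountVectorizer(stop_words='english', max_features=100)
bag = cv.fit_transform(newsgroups.data)
totals = bag.sum(axis=0)

# Sort the vocabulary by total count and show the ten most frequent words.
frequencies = sorted(((word, totals[0, idx]) for word, idx in cv.vocabulary_.items()),
                     key=lambda pair: pair[1], reverse=True)
for word, count in frequencies[:10]:
    print(word + ':', count)
```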

## Natural Language Processing
- NLP is the means by which machines learn to interpret human language in much the same way humans do. Communicating with the iPhone's Siri is one example of NLP in use: when a user speaks into their device, Siri converts the speech into a form the machine can process and outputs the information the user was looking for.
![](https://www.cybiant.com/wp-content/uploads/2020/01/CKC-Natural-Language-Processing.png)

## Lemmatization
- Lemmatization is the process of converting words into their root word, or lemma. In the example below, "play" is the lemma for all of the word's variations. Lemmatization is used in this project to improve the accuracy of the top topics printed for each cluster; a short code sketch follows the image below.
![](https://i.pinimg.com/736x/bf/81/be/bf81be81beba989425ea49e8856c952a.jpg)
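
As a quick illustration (not part of the project notebook), the snippet below shows NLTK's `WordNetLemmatizer` mapping several variations to the lemma "play". Note that the notebook calls `lemmatize` with the default noun part of speech; passing `pos='v'` here is an added assumption so the verb forms collapse as well.

```python
import nltk
from nltk.stem import WordNetLemmatizer

# WordNet data is required once per environment (same downloads as the notebook).
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
for word in ['plays', 'playing', 'played']:
    # With the verb part-of-speech tag, every variation maps to "play".
    print(word, '->', lemmatizer.lemmatize(word, pos='v'))
```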

## Topic Modeling

![](https://miro.medium.com/max/1400/1*cDwKSHmfp5awjqjobV707g.png)
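
Topic modeling discovers sets of words that tend to occur together across documents. In this project, NMF (non-negative matrix factorization) is applied to the documents in each K-means cluster to surface its top topics. The sketch below is a self-contained illustration of that idea on a few made-up sentences; the vectorizer and NMF settings echo the notebook, and depending on your scikit-learn version the feature-name call is `get_feature_names()` (as in the notebook) or `get_feature_names_out()` (used here).

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

# A few toy documents standing in for one cluster of newsgroup posts.
docs = [
    "the motorcycle engine needs new spark plugs",
    "faith and religion shape many communities",
    "this circuit uses a simple voltage regulator",
]

cv = CountVectorizer(stop_words='english')
counts = cv.fit_transform(docs)

# Factor the word counts into 3 topics and print the top words of each.
nmf = NMF(n_components=3, random_state=50).fit(counts)
feature_names = cv.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-4:-1]]
    print(topic_idx, ':', ' '.join(top_words))
```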

## References
- https://www.pinterest.com/pin/stemming-and-lemmatization-in-python--540713499008866837/
- https://www.cybiant.com/resources/natural-language-processing/
- https://medium.com/analytics-vidhya/how-to-perform-topic-modeling-using-mallet-abc43916560f
## Usage instructions

<!--
Give details on how to fork and install your project. You can get all of the Python dependencies for your project by typing `pip3 freeze > requirements.txt` on the system that runs your project. Add the generated `requirements.txt` to this repo.
-->
1. Fork this repo
2. Change directories into your project
3. On the command line, type `pip3 install -r requirements.txt`
4. ....
4.

## Contacts
- Tobi: [email protected]
- Jaden: [email protected]
- Brianna: [email protected]
- Laila: [email protected]