diff --git a/services/parser-database/app.py b/services/parser-database/app.py
index 1535330..35c42af 100644
--- a/services/parser-database/app.py
+++ b/services/parser-database/app.py
@@ -7,8 +7,8 @@
 from flask import jsonify # pylint: disable=import-error
 from parser import parse_all_documents
-from connector import get_articles_that_match_keywords
-from connector import get_article_by_number
+from connector import get_article_by_id_db
+from connector import get_articles_that_match_keywords_db
 
 
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
@@ -32,14 +32,14 @@ def get_keywords():
         logging.error(error)
         return error, 400
     else:
-        return jsonify(get_articles_that_match_keywords(json_request['keywords']))
+        return jsonify(get_articles_that_match_keywords_db(json_request['keywords']))
 
 
 @app.route('/articles/<int:id>', methods=['GET'])
 def get_article_by_number_in_memory(id):
     """Returns the article that matches the ID value according to the apiSpec.yaml file"""
-    article = get_article_by_number(str(id))
+    article = get_article_by_id_db(str(id))
     if article is not None:
         article = copy(article)
         return jsonify(article)
@@ -50,5 +50,5 @@ def get_article_by_number_in_memory(id):
 
 if __name__ == '__main__':
-    parse_all_documents()
+    # parse_all_documents()
     app.run(debug=True, host='0.0.0.0', port=os.getenv("PORT"))
diff --git a/services/parser-database/connector.py b/services/parser-database/connector.py
index 6486ceb..0f8a219 100644
--- a/services/parser-database/connector.py
+++ b/services/parser-database/connector.py
@@ -2,15 +2,56 @@
 services/databases"""
 import requests # pylint: disable=import-error
 import logging
+import numpy as np # pylint: disable=import-error
+import firebase_admin
+from firebase_admin import credentials
+from firebase_admin import firestore
 import constants
 import env
 
 logging.basicConfig(level=logging.INFO)
 
+
+class Keywords:
+    """Class for storing a keyword and the articles that contain it."""
+
+    def __init__(self, keyword):
+        self.keyword = keyword
+        self.articles_that_contain_keyword = {}
+
+    def to_dict(self):
+        return {
+            "keyword": self.keyword,
+            "contain": self.articles_that_contain_keyword,
+        }
+
+    @staticmethod
+    def from_dict(src):
+        keywords = Keywords(src["keyword"])
+        keywords.articles_that_contain_keyword = src["contain"]
+        return keywords
+
+
+cred = credentials.ApplicationDefault()
+firebase_admin.initialize_app(cred, {
+    'projectId': 'major-tom-285619',
+})
+
+db = firestore.client()
+
 articles_in_memory = {}
 keywords_in_memory = {}
 
+
+def get_documents_to_parse_db():
+    """Returns the documents stored in the 'documents' collection."""
+    documents_ref = db.collection(u'documents')
+    # stream() returns a one-shot generator, so materialize it before iterating
+    docs = list(documents_ref.stream())
+    for doc in docs:
+        logging.info('%s => %s', doc.id, doc.to_dict())
+    return docs
+
 
 def get_documents_to_parse():
     # When database is integrated, this will go away
     document_list = []
@@ -61,6 +102,34 @@
     return matching_articles
 
 
+def get_articles_by_tfidf_value(keywords_list):
+    """
+    Returns a weight for every article that matches a keyword in a
+    keyword list. The weight is the term frequency-inverse document
+    frequency (tf-idf) of the keyword in that article.
+
+    Args:
+        keywords_list (list): Keyword(s) to look for
+
+    Returns:
+        dict: articles and tf-idf weight for each keyword
+    """
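+    # Worked example (figures taken from the test fixtures below): with 5
+    # articles in memory and "licencia" appearing 3 times in a 16-word
+    # article that is one of 3 matches, the weight is (3/16) * log(5/3),
+    # roughly 0.096.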
+    matching_articles = {}
+    for keyword in keywords_list:
+        articles_that_match_keyword = {}
+        if keyword in keywords_in_memory:
+            for article in keywords_in_memory[keyword]:
+                # tf: how dense the keyword is in this article
+                word_count = articles_in_memory[str(article["number"])]["wordCount"]
+                term_density_in_article = article["frequency"] / word_count
+                # idf: log of (total articles / articles containing the keyword)
+                document_frequency = len(articles_in_memory) / len(keywords_in_memory[keyword])
+                inverse_doc_freq = np.log(document_frequency)
+                weight = term_density_in_article * inverse_doc_freq
+
+                articles_that_match_keyword[str(article["number"])] = {"weight": weight}
+        matching_articles[keyword] = articles_that_match_keyword
+    return matching_articles
+
+
 def save_keywords_in_memory(keywords, article):
     """Saves the keywords from an article in memory
 
@@ -83,3 +152,56 @@
     articles_in_memory[article_dict["id"]] = article_dict
     save_keywords_in_memory(get_keywords(article_dict["content"]), article_dict)
     logging.info('Article ' + article_dict["id"] + ' assigned keywords')
+
+
+def store_article_in_db(article_dict):
+    """Stores an article in the 'articles' collection and indexes its keywords."""
+    db.collection(u'articles').document(article_dict["id"]).set(article_dict)
+    save_keywords_in_db(get_keywords(article_dict["content"]), article_dict)
+    logging.info('Article ' + article_dict["id"] + ' assigned keywords')
+
+
+def save_keywords_in_db(keywords, article):
+    """Saves the keywords from an article in the database
+
+    Args:
+        keywords (JSON): contains keywords
+        article (Article): article object
+    """
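+    # Each document in the 'keywords' collection pairs the keyword with a
+    # map from article id to frequency, e.g. (illustrative values):
+    #   {"keyword": "ciclista", "matching_articles": {"monterrey5": 3}}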
+    for keyword in keywords:
+        frequency = article["content"].count(keyword)
+
+        docs = db.collection(u'keywords').where('keyword', '==', keyword).get()
+
+        if len(docs) != 0:
+            # keyword already indexed: update its map of matching articles
+            from_db = docs[0].to_dict()
+            from_db["matching_articles"][article["id"]] = frequency
+            db.collection(u'keywords').document(docs[0].id).set(from_db)
+        else:
+            # first time this keyword is seen: create its document
+            to_send = {"keyword": keyword, "matching_articles": {article["id"]: frequency}}
+            db.collection(u'keywords').add(to_send)
+
+
+def get_articles_that_match_keywords_db(keywords_list):
+    """Returns the articles that contain each keyword, read from the database."""
+    matching_articles = {}
+    for keyword in keywords_list:
+        articles_that_match_keyword = {}
+        docs = db.collection(u'keywords').where('keyword', '==', keyword).get()
+        if len(docs) != 0:
+            doc_dict = docs[0].to_dict()
+            for article_id, frequency in doc_dict["matching_articles"].items():
+                articles_that_match_keyword[str(article_id)] = {"weight": frequency}
+        matching_articles[keyword] = articles_that_match_keyword
+    return matching_articles
+
+
+def get_article_by_id_db(art_num):
+    """Returns the article stored under the given ID, or None if it does not exist."""
+    doc = db.collection(u'articles').document(art_num).get()
+    # document(...).get() always returns a snapshot; check its exists flag
+    if doc.exists:
+        return doc.to_dict()
+    return None
\ No newline at end of file
diff --git a/services/parser-database/parser.py b/services/parser-database/parser.py
index e35f3ce..a506bf2 100644
--- a/services/parser-database/parser.py
+++ b/services/parser-database/parser.py
@@ -38,7 +38,7 @@ class Article:
     def __init__(self, number, content):
         self.number = number
         self.content = content
-        self.id = str(number)
+        self.id = 'monterrey' + str(number)
 
     def to_dict(self):
         article_dict = {
@@ -49,6 +49,13 @@ def to_dict(self):
         }
         return article_dict
 
+    @staticmethod
+    def from_dict(src):
+        article = Article(src["number"], src["content"])
+        article.id = src["id"]
+        article.wordCount = src["wordCount"]
+        return article
+
 
 def identify_articles(pdf_text):
     """Identifies articles and returns a list of Article objects.
@@ -113,4 +120,6 @@
     for article in articles:
         dictionary = article.to_dict()
-        connector.store_article(dictionary)
+        connector.store_article_in_db(dictionary)
+
+# parse_all_documents()
\ No newline at end of file
diff --git a/services/parser-database/requirements.txt b/services/parser-database/requirements.txt
index a92fbc7..415c0ba 100644
--- a/services/parser-database/requirements.txt
+++ b/services/parser-database/requirements.txt
@@ -2,4 +2,6 @@ Flask
 utils
 https://github.com/timClicks/slate/archive/master.zip
 requests
-pytest-mock
+firebase-admin
+numpy
+pytest-mock
\ No newline at end of file
diff --git a/services/parser-database/tests/test_connector.py b/services/parser-database/tests/test_connector.py
index 5f25996..e0a23e7 100644
--- a/services/parser-database/tests/test_connector.py
+++ b/services/parser-database/tests/test_connector.py
@@ -35,6 +35,35 @@
 }
 
 
+in_memory_value_mock_no_decimals = {
+    "ciclista": [
+        {"number": 5, "frequency": 3},
+        {"number": 45, "frequency": 6},
+        {"number": 99, "frequency": 9},
+    ],
+    "licencia": [
+        {"number": 89, "frequency": 3},
+        {"number": 45, "frequency": 3},
+        {"number": 125, "frequency": 15},
+    ],
+}
+
+
+articles_in_memory = {'5': {'wordCount': 32}, '45': {'wordCount': 40}, '89': {'wordCount': 16},
+                      '99': {'wordCount': 50}, '125': {'wordCount': 200}}
+
+
+articles_in_memory_no_wordCount = {'5': {}, '45': {}, '89': {}, '99': {}, '125': {}}
+
+
+def logn(num):
+    """
+    Replaces np.log with the identity function so the
+    expected tf-idf weights come out as exact decimals
+    """
+    return num
+
+
 @mock.patch("connector.keywords_in_memory", in_memory_value_mock)
 def test_get_articles_that_match_keywords_empty_result_one_keyword():
     result_to_assert_1 = {"alcohol": {}}
@@ -71,6 +100,39 @@ def test_get_articles_that_match_keywords_non_empty_result_two_keywords():
     assert result == result_to_assert_4
 
 
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory)
+def test_get_articles_by_tfidf_value():
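+    # With np.log mocked as the identity, each weight reduces to
+    # (frequency / wordCount) * (total articles / matching articles),
+    # e.g. article 89 for "licencia": (3 / 16) * (5 / 3) = .3125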
+    expected = {
+        "licencia": {"89": {"weight": .3125}, "45": {"weight": .125}, "125": {"weight": .125}},
+        "ciclista": {"5": {"weight": .15625}, "45": {"weight": .25}, "99": {"weight": .3}},
+    }
+    keywords = ["licencia", "ciclista"]
+    with mock.patch("numpy.log", side_effect=logn):
+        assert expected == connector.get_articles_by_tfidf_value(keywords)
+
+
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory)
+def test_get_articles_by_tfidf_value_empty_result():
+    expected = {
+        "casco": {},
+        "luz": {},
+    }
+    keywords = ["casco", "luz"]
+    with mock.patch("numpy.log", side_effect=logn):
+        assert expected == connector.get_articles_by_tfidf_value(keywords)
+
+
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory_no_wordCount)
+def test_get_articles_by_tfidf_value_missing_word_count():
+    # articles without a 'wordCount' field cannot be weighted
+    keywords = ["licencia", "ciclista"]
+    with mock.patch("numpy.log", side_effect=logn):
+        with pytest.raises(KeyError):
+            connector.get_articles_by_tfidf_value(keywords)
+
+
 def test_get_documents():
     assert connector.get_documents_to_parse() == [constants.mty_document]