diff --git a/.github/workflows/dev_test_publish.yml b/.github/workflows/dev_test_publish.yml index 0341e38..1a9abdb 100644 --- a/.github/workflows/dev_test_publish.yml +++ b/.github/workflows/dev_test_publish.yml @@ -20,69 +20,110 @@ env: jobs: - test: - name: Test - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v3 - with: - ref: 'dev' + # test: + # name: Test 🧪 + # runs-on: ubuntu-latest + + # steps: + # - name: Checkout code + # uses: actions/checkout@v4 + # with: + # ref: 'dev' - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.9' + # - name: Set up Python + # uses: actions/setup-python@v5 + # with: + # python-version: '3.10.9' - - name: Install latest PIP - run: | - python -m pip install --upgrade pip >> $GITHUB_STEP_SUMMARY + # - name: Install latest PIP + # run: | + # python -m pip install --upgrade pip - - name: Install Dependencies - run: | - python -m pip install pytest pytest-cov >> $GITHUB_STEP_SUMMARY + # - name: Install Dependencies + # run: | + # python -m pip install pytest pytest-cov - - name: Setup factiva-analytics (this repo) - run: | - python -m pip install -e . >> $GITHUB_STEP_SUMMARY + # - name: Setup factiva-analytics (this repo) + # run: | + # python -m pip install . - - name: pytest - run: pytest test/ >> $GITHUB_STEP_SUMMARY + # - name: pytest + # run: pytest test/ build: - name: Build and Publish + name: Build 📦 runs-on: ubuntu-latest - needs: [test] + # needs: [test] + permissions: + id-token: write steps: - name : Checkout code - uses : actions/checkout@v3 + uses : actions/checkout@v4 with: ref: 'dev' - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.10.9' - - name: Install latest pip, setuptools, twine + wheel + # changes + + - name: Install pypa/build run: | - python -m pip install --upgrade pip setuptools wheel >> $GITHUB_STEP_SUMMARY - - - name: Build wheels + python -m pip install --upgrade build + - name: Build a binary build and a source tarball run: | - python setup.py bdist_wheel >> $GITHUB_STEP_SUMMARY - python setup.py sdist >> $GITHUB_STEP_SUMMARY - - - name: Upload Artifact - uses: actions/upload-artifact@v3 + python -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 with: - name: Wheel_library + name: python-package-distributions path: dist/ + + # - name: Install latest pip, setuptools, twine + wheel + # run: | + # python -m pip install --upgrade pip setuptools wheel >> $GITHUB_STEP_SUMMARY - - name: Publish package to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ - skip_existing: true + # - name: Build wheels + # run: | + # python setup.py bdist_wheel >> $GITHUB_STEP_SUMMARY + # python setup.py sdist >> $GITHUB_STEP_SUMMARY + + + publish-to-testpypi: + name: Publish 📦 to TestPyPI + needs: + - build + runs-on: ubuntu-latest + + environment: + name: testpypi + url: https://test.pypi.org/p/factiva-analytics + + permissions: + id-token: write + + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish 📦 to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + verbose: true + skip-existing: true + + # - name: GitHub Repo Artifact Upload + # uses: actions/upload-artifact@v4 + # 
with: + # name: Wheel_library + # path: dist/ + + # - name: Publish package to TestPyPI + # uses: pypa/gh-action-pypi-publish@release/v1 + # with: + # repository-url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/main_test_publish.yml b/.github/workflows/main_test_publish.yml index 9633001..684948b 100644 --- a/.github/workflows/main_test_publish.yml +++ b/.github/workflows/main_test_publish.yml @@ -25,12 +25,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v3 with: - python-version: '3.9' + python-version: '3.10.9' - name: Install latest PIP run: | @@ -53,12 +53,12 @@ jobs: needs: [test] steps: - name : Checkout code - uses : actions/checkout@v3 + uses : actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v3 with: - python-version: '3.9' + python-version: '3.10.9' - name: Install latest pip, setuptools, twine + wheel run: | @@ -80,4 +80,4 @@ jobs: with: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} - skip_existing: false + skip-existing: false diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1cae6b2..546c6dc 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -2,26 +2,23 @@ # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details -# Required version: 2 -# Set the version of Python and other tools you might need build: os: ubuntu-22.04 tools: - python: "3.9" + python: "3.12" -# Build documentation in the docs/ directory with Sphinx sphinx: - configuration: docs/source/conf.py + configuration: docs/source/conf.py -# Optionally build your docs in additional formats such as PDF formats: - - pdf + - pdf + - epub -# Optionally set the version of Python and requirements required to build your docs python: install: - requirements: docs/requirements.txt - method: pip path: . + diff --git a/README.rst b/README.rst index c28c192..aa91f53 100755 --- a/README.rst +++ b/README.rst @@ -1,6 +1,7 @@ Dow Jones Factiva Analytics Python Library ########################################## .. image:: https://github.com/dowjones/factiva-analytics-python/actions/workflows/main_test_publish.yml/badge.svg +.. image:: https://readthedocs.org/projects/factiva-analytics-python/badge/?version=latest&style=plastic This library simplifies the integration to Factiva Analytics API services that delivers premium news content. @@ -10,6 +11,7 @@ The following services are currently implemented. * **Snapshots**: Allows to run each snapshot creation, monitoring, download and local exploration, in an individual manner. Also allows to run the whole process within a single method. * **Streams**: In addition to creating and getting stream details, contains the methods to easily implement a stream listener and push the content to other locations appropriate for high-available setups. * **Taxonomy**: Operations that return taxonomies applied to classify news content. +* **ArticleFetcher**: Gets article's content by unique identifiers (AN), for display purposes only. Installation ============ @@ -23,68 +25,95 @@ Using Library services ====================== Most Factiva Analytics services are implemented in this library. There may be a delay (commonly weeks) when new features are released and their operations are implemented in this package. 
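+
+The examples below are minimal sketches. Unless a key value is passed explicitly, they
+assume the ``FACTIVA_USERKEY`` environment variable is set, for example (hypothetical key):
+
+.. code-block:: python
+
+    import os
+    os.environ['FACTIVA_USERKEY'] = 'abcd1234abcd1234abcd1234abcd1234'  # hypothetical key
+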
-Creating a User Instance and Getting its statistics
----------------------------------------------------
-Create `UserKey` instance and retrieve a summary of the account statistics.
+Getting Account Information
+---------------------------
+Create an `AccountInfo` instance that contains a summary of the account's basic information and usage statistics.
 
 .. code-block:: python
 
-    from factiva.analytics import UserKey
-    u = UserKey(
-        key='abcd1234abcd1234abcd1234abcd1234', # Not needed if the ENV variable FACTIVA_USERKEY is set
-        stats=True) # Connects to the API and pulls the latest account status
+    from factiva.analytics import AccountInfo
+    u = AccountInfo(
+        user_key='abcd1234abcd1234abcd1234abcd1234'  # Not needed if the ENV variable FACTIVA_USERKEY is set
+    )
     print(u)
 
 .. code-block::
 
-    
-    |-key = ****************************1234
-    |-cloud_token = **Not Fetched**
-    |-account_name = AccName1234
-    |-account_type = account_with_contract_limits
-    |-active_products = DNA
-    |-max_allowed_concurrent_extractions = 5
-    |-max_allowed_extracted_documents = 200,000
-    |-max_allowed_extractions = 3
-    |-currently_running_extractions = 0
-    |-total_downloaded_bytes = 7,253,890
-    |-total_extracted_documents = 2,515
-    |-total_extractions = 1
-    |-total_stream_instances = 4
-    |-total_stream_subscriptions = 1
-    |-enabled_company_identifiers = [{'id': 4, 'name': 'isin'}, {'id': 3, 'name': 'cusip'}, {'id': 1, 'name': 'sedol'}, {'id': 5, 'name': 'ticker_exchange'}]
-    |-remaining_documents = 197,485
-    |-remaining_extractions = 2
-
-Snapshots
----------
+    <'factiva.analytics.AccountInfo'>
+    ├─user_key: <'factiva.analytics.UserKey'>
+    │  ├─key: ****************************1234
+    │  └─cloud_token: **********************YKB12sJrkHXX
+    ├─account_name: AccName1234
+    ├─account_type: account_with_contract_limits
+    ├─active_product: DNA
+    ├─max_allowed_extracted_documents: 8,000,000
+    ├─max_allowed_extractions: 20
+    ├─currently_running_extractions: 0
+    ├─total_extracted_documents: 5,493,078
+    ├─total_extractions: 4
+    ├─total_stream_instances: 0
+    ├─total_stream_subscriptions: 0
+    ├─extractions_list: 
+    ├─streams_list: 
+    ├─enabled_company_identifiers:
+    │  ├─[1]: sedol
+    │  ├─[3]: cusip
+    │  ├─[4]: isin
+    │  └─[5]: ticker_exchange
+    ├─remaining_documents: 2,506,922
+    └─remaining_extractions: 16
+
+
+Snapshot Explain
+----------------
+Creates an API request that tests the query and returns the number of matching items in the archive.
+
+.. code-block:: python
+
+    from factiva.analytics import SnapshotExplain
+    my_query = "publication_datetime >= '2023-01-01 00:00:00' AND UPPER(source_code) = 'DJDN'"
+    my_explain = SnapshotExplain(
+        user_key='abcd1234abcd1234abcd1234abcd1234',  # Not needed if the ENV variable FACTIVA_USERKEY is set
+        query=my_query)
+    my_explain.process_job()  # This operation can take several seconds to complete
+    print(my_explain)
+
+.. code-block::
+
+    <'factiva.analytics.SnapshotExplain'>
+    ├─user_key: <'factiva.analytics.UserKey'>
+    │  ├─key: ****************************1234
+    │  └─cloud_token: **********************YKB12sJrkHXX
+    ├─query: <'factiva.analytics.SnapshotExplainQuery'>
+    │  ├─where: publication_datetime >= '2023-01-01 00:00:00' AND UPPER(source_code) = 'DJDN'
+    │  ├─includes: 
+    │  ├─excludes: 
+    │  ├─include_lists: 
+    │  └─exclude_lists: 
+    ├─job_response: <'factiva.analytics.SnapshotExplainJobResponse'>
+    │  ├─job_id: 3ee35a80-0406-4f2b-a999-3e4eb5aa94d8
+    │  ├─job_link: https://api.dowjones...8/_explain
+    │  ├─job_state: JOB_STATE_DONE
+    │  ├─volume_estimate: 2,482,057
+    │  └─errors: 
+    └─samples: 
+
+
+Snapshot Extraction
+-------------------
 Creating a new snapshot and downloading it to a local repository requires just a few lines of code.
 
 .. code-block:: python
 
-    from factiva.analytics import Snapshot
-    my_query = "publication_datetime >= '2020-01-01 00:00:00' AND LOWER(language_code) = 'en'"
-    my_snapshot = Snapshot(
+    from factiva.analytics import SnapshotExtraction
+    my_query = "publication_datetime >= '2023-01-01 00:00:00' AND UPPER(source_code) = 'DJDN'"
+    my_snapshot = SnapshotExtraction(
         user_key='abcd1234abcd1234abcd1234abcd1234', # Can be omitted if set as an env variable
         query=my_query)
-    my_snapshot.process_extract()  # This operation can take several minutes to complete
+    my_snapshot.process_job()  # This operation can take several minutes to complete
 
 After the process completes, the output files are stored in a subfolder named after the Extraction Job ID.
 
 In the previous code a new snapshot is created using my_query as the selection criteria and user_key for
 user authentication. After the job is validated internally, a Snapshot ID is obtained along with the list
 of files to download. Files are automatically downloaded to a folder named after the snapshot ID. This
 process may take several minutes, but automates the extraction process significantly.
 
-Streams
--------
-Create a stream instance and get the details to configure the stream client and listen the content as it is delivered.
-
-.. code-block:: python
-
-    from factiva.analytics import Stream
-    stream_query = Stream(
-        user_key='abcd1234abcd1234abcd1234abcd1234', # Can be ommited if exist as env variable
-        user_key_stats=True,
-        query="publication_datetime >= '2021-04-01 00:00:00' AND LOWER(language_code)='en' AND UPPER(source_code) = 'DJDN'",
-    )
-
-    print(stream_query.create())
diff --git a/docs/source/concepts/articlefetch.rst b/docs/source/concepts/articlefetch.rst
new file mode 100644
index 0000000..61054e8
--- /dev/null
+++ b/docs/source/concepts/articlefetch.rst
@@ -0,0 +1,4 @@
+Article Fetch
+=============
+
+ArticleFetcher operations tutorial
diff --git a/docs/source/concepts/articleretrieval.rst b/docs/source/concepts/articleretrieval.rst
deleted file mode 100644
index db51a82..0000000
--- a/docs/source/concepts/articleretrieval.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Article Retrieval
-=================
-
-Article Retrieval operations tutorial
diff --git a/docs/source/concepts/auth.rst b/docs/source/concepts/auth.rst
index b114246..2a4cedb 100644
--- a/docs/source/concepts/auth.rst
+++ b/docs/source/concepts/auth.rst
@@ -1,15 +1,63 @@
+.. _concepts_auth:
+
 Authentication
 ==============
 
-Factiva Analytics and the Article Retrieval Service use two different methods of authentication: ``UserKey`` and ``OAuthUser``.
+Depending on the service intended to be used, an operation object will need either a
+``UserKey`` or ``OAuthUser`` instance. As a best practice, it is recommended to use
+ENV variables to store the values used to instantiate these objects (see Getting Started
+> Environment Variables > :ref:`gettingstarted_envvariables_auth`).
+
+However, most class constructors automatically look for the relevant environment
+variables and perform the authentication on their own.
+
+If an explicit user creation is needed, the following classes can be used:
 
 UserKey
 -------
-Used by all services except the Article Retrieval Service.
+Used by all services except the Article Fetcher Service. Usually it's not required to
+be instantiated independently, as the creation of a `parent` object will get the value
+from the environment.
+
+If using this class explicitly, the following code snippet can be helpful:
+
+.. code-block:: python
+
+    from factiva.analytics import UserKey, SnapshotExplain
+    u = UserKey('abcd1234abcd1234abcd1234abcd1234')
+    se = SnapshotExplain(user_key=u)
+
+When using ENV variables, the above snippet requires only the constructor call from the parent
+class, in this case `SnapshotExplain()`.
+
+.. code-block:: python
+
+    from factiva.analytics import SnapshotExplain
+    se = SnapshotExplain()
 
 OAuthUser
 ---------
-Used by the Article Retrieval Service only.
+Used by the Article Fetcher Service only. Like ``UserKey``, it is usually not required
+to be instantiated independently. However, the code snippets below can be helpful when
+using this class explicitly:
+
+.. code-block:: python
+
+    from factiva.analytics import OAuthUser, ArticleFetcher
+    c_id = "0abcd1wxyz2abcd3wxyz4abcd5wxyz6o"
+    uname = "9ZZZ000000-svcaccount@dowjones.com"
+    pwd = "pa55WOrdpa55WOrd"
+    ou = OAuthUser(client_id=c_id, username=uname, password=pwd)
+    ar = ArticleFetcher(oauth_user=ou)
+    ...
+
+When using ENV variables, the above snippet requires only the constructor call from the parent
+class, in this case `ArticleFetcher()`.
+
+.. code-block:: python
+
+    from factiva.analytics import ArticleFetcher
+    ar = ArticleFetcher()
diff --git a/docs/source/concepts/volumeestimates.rst b/docs/source/concepts/volumeestimates.rst
index ebe90b9..7741356 100644
--- a/docs/source/concepts/volumeestimates.rst
+++ b/docs/source/concepts/volumeestimates.rst
@@ -1,4 +1,46 @@
 Volume Estimates
 ================
 
-Volume Estimate operations tutorial
+Accurate volume estimates are based on the Snapshot Explain operation. This
+operation returns the exact number of matching articles in the archive.
+
+.. code-block:: python
+
+    from factiva.analytics import SnapshotExplain
+    where_str = "publication_datetime >= '2020-01-01' AND language_code = 'en' AND REGEXP_CONTAINS(industry_codes, r'(?i)(^|,)(i1|i25121|i2567)($|,)')"
+    se = SnapshotExplain(query=where_str)
+    se.process_job()
+    print(f"The query matches {se.job_results.volume_estimate} articles")
+
+
+.. code-block::
+
+    The query matches 123456 articles
+
+Using the same Snapshot Explain object, you can also get metadata samples.
+
+.. code-block:: python
+
+    se.get_samples()
+    print(se.samples.data[['word_count', 'title', 'source_code']])
+
+The object ``se.samples.data`` is a pandas DataFrame.
+
+.. code-block::
+
+        word_count                                              title source_code
+    0          110   Maire Tecnimont shares gain after India contract    SOLRADIN
+    1          147  Poste Italiane begins offering electricity, ga...    SOLRADIN
+    2          219  Constellation Energy inks PPA with Microsoft f...    SOLRADIN
+    3           25  EDF now sees Hinkley Point C IRR 7.1-7.2% vs 7...    SOLRADIN
+    4          131        Derivatives stock options: summary by title    SOLRADIN
+    ..         ...                                                 ...         ...
+    95         775  Atlantic Power and Infrastructure Installs Tre...      ACWIRE
+    96         249  Quebec Precious Metals Corporation Announces R...      ACWIRE
+    97         503  Shareholders that lost money on Plug Power Inc...      ACWIRE
+    98        1572  Tenth Avenue Petroleum Announces Third Quarter...      ACWIRE
+    99         187  Challenging Ourselves To Lead in Sustainable E...      ACWIRE
+
+When volume estimates are in line with your expectations, you can proceed to analyze
+the data using the Snapshot TimeSeries operation, or directly extract the content via
+the Snapshot Extract operation.
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index d62db7e..f007053 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -13,11 +13,11 @@
 import os
 import sys
 from datetime import date
-from factiva.analytics.__version__ import __version__
+
 sys.path.insert(0, os.path.abspath('../../src'))
 sys.path.insert(0, os.path.abspath('../../src/factiva'))
 sys.path.insert(0, os.path.abspath('../../src/factiva/analytics'))
-sys.path.insert(0, os.path.abspath('../../src/factiva/analytics/article_retrieval'))
+sys.path.insert(0, os.path.abspath('../../src/factiva/analytics/article_fetcher'))
 sys.path.insert(0, os.path.abspath('../../src/factiva/analytics/auth'))
 sys.path.insert(0, os.path.abspath('../../src/factiva/analytics/common'))
 sys.path.insert(0, os.path.abspath('../../src/factiva/analytics/taxonomy'))
@@ -29,10 +29,11 @@
 current_year = date.today().year
 
 project = 'Factiva Analytics'
-copyright = f'{current_year}, Dow Jones'
+copyright = f"{current_year}, Dow Jones"
 author = 'Dow Jones - Customer Solutions Engineering (CSE) Team'
 
 # The full version, including alpha/beta/rc tags
+from factiva.analytics.__version__ import __version__
 release = __version__
 version = __version__
 
@@ -81,18 +82,17 @@
 html_title = 'Factiva Analytics - Python Library'
 
 html_theme_options = {
-    'announcement': 'This library is under development!',
-}
-
-html_theme_options['footer_icons'] = [
-    {
-        'name': 'GitHub',
-        'url': 'https://github.com/dowjones/factiva-analytics-python',
-        'html': """
+    # "announcement": "This library is under development!",
+    "footer_icons": [
+        {
+            "name": "GitHub",
+            "url": "https://github.com/dowjones/factiva-analytics-python",
+            "html": """
             """,
-        'class': 'fa-solid fa-github fa-2x',
-    },
-]
+            "class": "fa-solid fa-github fa-2x",
+        }
+    ],
+}
diff --git a/docs/source/factiva.analytics/article_retrieval.rst b/docs/source/factiva.analytics/article_fetcher.rst
similarity index 58%
rename from docs/source/factiva.analytics/article_retrieval.rst
rename to docs/source/factiva.analytics/article_fetcher.rst
index 984e86a..02fc157 100644
--- a/docs/source/factiva.analytics/article_retrieval.rst
+++ b/docs/source/factiva.analytics/article_fetcher.rst
@@ -1,18 +1,18 @@
-Article Retrieval Service
-#########################
+Article API Service
+###################
 
 When enabled along with all other Factiva Analytics APIs, this service allows
 to retrieve content for display purposes when end users need to read the
 underlying content from a calculated score or derived datapoint after
 processing Snapshots or Streams news articles.
 
-ArticleRetrieval
+ArticleFetcher
 ----------------
 
-.. autoclass:: factiva.analytics.article_retrieval.article_retrieval.ArticleRetrieval
+.. autoclass:: factiva.analytics.article_fetcher.article_fetcher.ArticleFetcher
     :members:
 
 UIArticle
 ---------
 
-.. autoclass:: factiva.analytics.article_retrieval.article_retrieval.UIArticle
+.. autoclass:: factiva.analytics.article_fetcher.article_fetcher.UIArticle
     :members:
diff --git a/docs/source/factiva.analytics/auth.rst b/docs/source/factiva.analytics/auth.rst
index 40a414a..df08c7a 100644
--- a/docs/source/factiva.analytics/auth.rst
+++ b/docs/source/factiva.analytics/auth.rst
@@ -1,9 +1,16 @@
-Authentication Service
+Authentication Classes
 ######################
 
 Module part of the core components for the Factiva Analytics python package.
 Contains classes and tools that allow to interact with the authentication and
 authorization elements of the Factiva Analytics API.
 
+AccountInfo
+-----------
+
+.. autoclass:: factiva.analytics.auth.accountinfo.AccountInfo
+    :members:
+    :member-order: groupwise
+
 UserKey
 -------
 
diff --git a/docs/source/gettingstarted/commonops.rst b/docs/source/gettingstarted/commonops.rst
index 8a219b5..bd1a033 100644
--- a/docs/source/gettingstarted/commonops.rst
+++ b/docs/source/gettingstarted/commonops.rst
@@ -9,31 +9,35 @@ Assumes the ENV variable ``FACTIVA_USERKEY`` is set.
 
 .. code-block:: python
 
-    from factiva.analytics import UserKey
-    u = UserKey(stats=True)
+    from factiva.analytics import AccountInfo
+    u = AccountInfo()
     print(u)
 
 .. code-block::
 
-    
-    ├─key: ****************************1234
-    ├─cloud_token: 
-    ├─account_name: 
-    ├─account_type: 
-    ├─active_product: 
-    ├─max_allowed_concurrent_extractions: 0
-    ├─max_allowed_extracted_documents: 0
-    ├─max_allowed_extractions: 0
+    <'factiva.analytics.AccountInfo'>
+    ├─user_key: <'factiva.analytics.UserKey'>
+    │  ├─key: ****************************1234
+    │  └─cloud_token: **********************YKB22sJCkHXX
+    ├─account_name: AccountName
+    ├─account_type: account_with_contract_limits
+    ├─active_product: DNA
+    ├─max_allowed_concurrent_extractions: 1
+    ├─max_allowed_extracted_documents: 2,200,000
+    ├─max_allowed_extractions: 10
     ├─currently_running_extractions: 0
-    ├─total_downloaded_bytes: 0
-    ├─total_extracted_documents: 0
-    ├─total_extractions: 0
-    ├─total_stream_instances: 0
-    ├─total_stream_subscriptions: 0
+    ├─total_downloaded_bytes: 84,195,293
+    ├─total_extracted_documents: 145,605
+    ├─total_extractions: 3
+    ├─total_stream_instances: 2
+    ├─total_stream_subscriptions: 2
     ├─enabled_company_identifiers:
-    │  └─
-    ├─remaining_documents: 0
-    └─remaining_extractions: 0
+    │  ├─[1]: sedol
+    │  ├─[3]: cusip
+    │  ├─[4]: isin
+    │  └─[5]: ticker_exchange
+    ├─remaining_documents: 2,054,395
+    └─remaining_extractions: 7
 
 
 Get Account's Historical Full Extractions
@@ -43,11 +47,12 @@ Uses the passed ``key`` parameter and ignores the ENV variable ``FACTIVA_USERKEY
 
 .. code-block:: python
 
-    from factiva.analytics import UserKey
-    u = UserKey(key='abcd1234abcd1234abcd1234abcd1234')
+    from factiva.analytics import AccountInfo
+    u = AccountInfo(user_key='abcd1234abcd1234abcd1234abcd1234')
     extractions = u.get_extractions()
 
-The variable ``extractions`` will contain a ``pandas.DataFrame`` instance with the requested data.
+The variable ``extractions`` will contain a ``SnapshotExtractionList`` object
+whose items are ``SnapshotExtraction`` instances.
 
 
 Get Volume Estimates With Snapshot Explain
@@ -60,7 +65,89 @@ 
     from factiva.analytics import SnapshotExplain
     my_query = "publication_datetime >= '2020-01-01 00:00:00' AND LOWER(language_code) = 'en'"
     my_explain = SnapshotExplain(query=my_query)
-    my_explain.process_explain()  # This operation can take several minutes to complete
+    my_explain.process_job()  # This operation can take a few minutes to complete
     print(my_explain)
 
-After its execution, the object `last_explain_job` contains details about the job itself and the estimated volume.
+.. code-block::
+
+    <'factiva.analytics.SnapshotExplain'>
+    ├─user_key: <'factiva.analytics.UserKey'>
+    │  ├─key: ****************************1234
+    │  └─cloud_token: **********************YKB22sJCkHXX
+    ├─query: <'factiva.analytics.SnapshotExplainQuery'>
+    │  ├─where: publication_datetime >= '2023-01-01 00:00:00' AND UPPER(source_code) = 'DJDN'
+    │  ├─includes: 
+    │  ├─excludes: 
+    │  ├─include_lists: 
+    │  └─exclude_lists: 
+    ├─job_response: <'factiva.analytics.SnapshotExplainJobResponse'>
+    │  ├─job_id: 648075e7-b551-4bdb-b8f4-ed7f470ae6bd
+    │  ├─job_link: https://api.dowjones.com/alpha/extractions/documents/648075e7-b551-4bdb-b8f4-ed7f470ae6bd/_explain
+    │  ├─job_state: JOB_STATE_DONE
+    │  ├─volume_estimate: 203,338
+    │  └─errors: 
+    └─samples: 
+
+After its execution, the object ``my_explain.job_results`` contains details about the job itself and the estimated volume.
+
+
+Get Extraction Details and Download Files
+-----------------------------------------
+
+Assumes the ENV variable ``FACTIVA_USERKEY`` is set. Only the extraction ``short_id`` is passed explicitly.
+
+.. code-block:: python
+
+    from factiva.analytics import SnapshotExtraction
+    se = SnapshotExtraction('zmhsvx20tl')
+    print(se)
+
+.. code-block::
+
+    <'factiva.analytics.SnapshotExtraction'>
+    ├─user_key: <'factiva.analytics.UserKey'>
+    │  ├─key: ****************************1234
+    │  └─cloud_token: **********************YKB22sJCkHXX
+    ├─query: 
+    └─job_response: <'factiva.analytics.SnapshotExtractionJobReponse'>
+      ├─job_id: dj-synhub-extraction-abcd1234abcd1234abcd1234abcd1234-zmhsvx20tl
+      ├─job_link: https://api.dowjones.com/alpha/extractions/documents/dj-synhub-extraction-abcd1234abcd1234abcd1234abcd1234-zmhsvx20tl
+      ├─job_state: JOB_STATE_DONE
+      ├─short_id: zmhsvx20tl
+      ├─files: <'list'> - [1] elements
+      └─errors: 
+
+.. code-block:: python
+
+    se.download_files()
+
+When the operation ends, files will be available in the local folder named after the ``short_id`` attribute (``zmhsvx20tl``).
+
+
+Create a Streaming Instance
+---------------------------
+
+Assumes the ENV variable ``FACTIVA_USERKEY`` is set.
+
+.. code-block:: python
+
+    from factiva.analytics import StreamingInstance
+    my_query = "publication_datetime >= '2020-01-01 00:00:00' AND LOWER(language_code) = 'en'"
+    my_stream = StreamingInstance(query=my_query)
+    my_stream.create()
+    print(my_stream)
+
+.. code-block::
+
+    <'factiva.analytics.StreamingInstance'>
+    ├─id: 
+    ├─short_id: 4doq2zigpf
+    ├─user_key: <'factiva.analytics.UserKey'>
+    │  ├─key: ****************************1234
+    │  └─cloud_token: **********************YKB22sJCkHXX
+    ├─query: "publication_datetime >= '2020-01-01 00:00:00' AND LOWER(language_code) = 'en'"
+    ├─subscriptions: 
+    │  └─short_id: R4QwwB
+    └─status: JOB_STATE_RUNNING
+
+After its execution, the object ``my_stream`` contains the details of the newly created streaming instance and its subscription.
\ No newline at end of file
diff --git a/docs/source/gettingstarted/envvariables.rst b/docs/source/gettingstarted/envvariables.rst
index c87d823..ae11c3d 100644
--- a/docs/source/gettingstarted/envvariables.rst
+++ b/docs/source/gettingstarted/envvariables.rst
@@ -1,9 +1,13 @@
+.. _gettingstarted_envvariables:
+
 Environment Variables
 =====================
 
 When a class is instantiated, depending on the functionality some Environment Variables might be
-required unless a value is specified.
+required unless a value is specified explicitly in the code.
+
+.. _gettingstarted_envvariables_auth:
 
 Authentication
 --------------
@@ -11,25 +15,39 @@
 UserKey
 ^^^^^^^
 
-* ``FACTIVA_USERKEY`` : Assigned API user key. E.g. ``abcd1234abcd1234abcd1234abcd1234``. Used in all services except ArticleRetrieval.
+* ``FACTIVA_USERKEY``: Assigned API user key. E.g. ``abcd1234abcd1234abcd1234abcd1234``.
+  Used in all services except ArticleFetcher.
 
 OAuthUser
 ^^^^^^^^^
 
-* ``FACTIVA_CLIENTID`` : Assigned OAuth Client ID. E.g. ``0abcd1wxyz2abcd3wxyz4abcd5wxyz6o``. Required for ArticleRetrieval.
-* ``FACTIVA_USERNAME`` : Assigned OAuth Username. E.g. ``9ZZZ000000-svcaccount@dowjones.com``. Required for ArticleRetrieval.
-* ``FACTIVA_PASSWORD`` : Assigned OAuth Password. E.g. ``pa55WOrdpa55WOrd``. Required for ArticleRetrieval.
+* ``FACTIVA_CLIENTID``: Assigned OAuth Client ID. E.g. ``0abcd1wxyz2abcd3wxyz4abcd5wxyz6o``.
+  Required for ArticleFetcher.
+* ``FACTIVA_USERNAME``: Assigned OAuth Username. E.g. ``0XXX000000-svcaccount@dowjones.com``.
+  Required for ArticleFetcher.
+* ``FACTIVA_PASSWORD``: Assigned OAuth Password. E.g. ``pa55WOrdpa55WOrd``.
+  Required for ArticleFetcher.
 
-Logging
-^^^^^^^
-* ``FACTIVA_LOGLEVEL``: Level of detail for the logs. Accepted values are ``DEBUG``, ``INFO`` (default), ``WARNING``, ``ERROR``, ``CRITICAL``.
+
+.. _gettingstarted_envvariables_snapshots:
+
+Snapshots & Streams
+-------------------
+* ``FACTIVA_WHERE``: Query where statement that will be used when creating a new Snapshots
+  or Streams object with no where/query parameter.
+* ``FACTIVA_SUBSCRIPTIONID``: Subscription ID from an existing Streaming Instance. E.g.
+  ``dj-synhub-stream-abcd1234abcd1234abcd1234abcd1234-1234abcxyz-filtered-abc123``.
 
-Streams
+
+.. _gettingstarted_envvariables_logging:
+
+Logging
 -------
-* ``FACTIVA_SUBSCRIPTIONID`` : Subscription ID from an existing Streaming Instance. E.g. ``dj-synhub-stream-abcd1234abcd1234abcd1234abcd1234-1234abcxyz-filtered-abc123``.
+* ``FACTIVA_LOGLEVEL``: Level of detail for the logs.
+  Accepted values are ``DEBUG``, ``INFO`` (`default`), ``WARNING``, ``ERROR``, ``CRITICAL``.
+
 
 Handlers and Data Processing
diff --git a/docs/source/gettingstarted/installation.rst b/docs/source/gettingstarted/installation.rst
index 046a347..e90bf04 100644
--- a/docs/source/gettingstarted/installation.rst
+++ b/docs/source/gettingstarted/installation.rst
@@ -1,14 +1,14 @@
 Installation
 ============
 
-PIP
----
+Regular installation using PIP
+------------------------------
 
 This package can be installed using PIP. The recommended procedure is running:
 
- .. code-block::
+.. code-block::
 
-    pip install -u factiva-analytics
+    pip install -U factiva-analytics
 
 This will install and/or update the package to the latest official release.
@@ -20,7 +20,7 @@ Alternatively it can be installed directly from GitHub by running:
 package guidelines establish that the ``main`` branch is also the latest official
 release. However, this method allows to install pre-release versions in any of the available branches
-in the repository like ``dev``.
+of the repository like ``dev``.
 
 
 Optional Packages
@@ -33,5 +33,32 @@ require the installation of additional packages like Elasticsearch or Google Clo
 
 This is the list of optional packages. Installing them is recommended as long as these components will be used within the solution.
 
-* elasticsearch: Used in a Streams custom handler and bulk data import.
-* bigquery: Used in a Streams custom handler and bulk data import.
+* **elasticsearch**: Used in a Streams custom handler and bulk data import.
+
+  .. code-block::
+
+      pip install factiva-analytics[elasticsearch]
+
+* **bigquery**: Used in a Streams custom handler and bulk data import.
+
+  .. code-block::
+
+      pip install factiva-analytics[bigquery]
+
+* **mongodb**: Used in a Streams custom handler and bulk data import.
+
+  .. code-block::
+
+      pip install factiva-analytics[mongodb]
+
+
+Dev Environment
+---------------
+
+These packages are not required for production use.
+
+* **dev**: Includes packages for testing and development. This code sample assumes the git repository is cloned and the command is executed from the root directory. It installs the library in editable mode along with the development dependencies.
+
+  .. code-block::
+
+      pip install -e .[dev]
diff --git a/docs/source/gettingstarted/overview.rst b/docs/source/gettingstarted/overview.rst
index 0e1b701..85df49a 100644
--- a/docs/source/gettingstarted/overview.rst
+++ b/docs/source/gettingstarted/overview.rst
@@ -1,7 +1,9 @@
 Overview
 ========
 
-Factiva News is a package that provides utilities to ease the integration to Factiva Analytics
-APIs which is part of the Dow Jones Developer Platform. This packages aims to ease news data
-operations like estimations, extractions and real-time consumption.
+``factiva-analytics`` is a package that provides tools to ease the integration of Factiva Analytics
+APIs, which are part of the Dow Jones Developer Platform family. This package aims to simplify news
+data operations like estimations, extractions and real-time consumption.
+
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c703062..2bf05c4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -17,7 +17,7 @@ Check out the `Dow Jones Developer Portal <https://developer.dowjones.com>`_ for
 .. toctree::
    :maxdepth: 2
-   :caption: Main Concepts
+   :caption: Core Concepts
    :glob:
 
    concepts/auth
@@ -27,7 +27,7 @@ Check out the `Dow Jones Developer Portal <https://developer.dowjones.com>`_ for
    concepts/updates
    concepts/streams
    concepts/lists
-   concepts/articleretrieval
+   concepts/articlefetch
    concepts/advancedqueries
 
 .. toctree::
@@ -40,4 +40,4 @@ Check out the `Dow Jones Developer Portal <https://developer.dowjones.com>`_ for
    factiva.analytics/snapshotexplain
    factiva.analytics/snapshottimeseries
    factiva.analytics/snapshotextraction
-   factiva.analytics/article_retrieval
+   factiva.analytics/article_fetcher
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..97825da
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+# XXX: If your project needs other packages to build properly, add them to this list.
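+# For example, a project compiling native extensions might extend it like
+# requires = ["setuptools >= 64", "wheel", "cython"]  (hypothetical dependencies)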
+requires = ["setuptools >= 64"] +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index cc439b3..f7ba575 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup +from setuptools import setup, find_packages with open("README.rst", "r") as fh: long_desc = fh.read() @@ -23,12 +23,12 @@ # Warning: the folder 'factiva' should NOT have an __init__.py file to avoid conflicts with the same namespace across other packages package_dir={'': 'src'}, - packages=['factiva.analytics', 'factiva.analytics.article_retrieval', 'factiva.analytics.auth', + packages=['factiva.analytics', 'factiva.analytics.article_fetcher', 'factiva.analytics.auth', 'factiva.analytics.common', 'factiva.analytics.taxonomy', 'factiva.analytics.snapshots', - 'factiva.analytics.streams' - #, 'factiva.analytics.integration' + 'factiva.analytics.streams', 'factiva.analytics.integration' # , 'factiva.analytics.lists', ], + # packages=find_packages(where='src'), url='https://developer.dowjones.com/', project_urls={ "GitHub": "https://github.com/dowjones/factiva-analytics-python", @@ -48,30 +48,41 @@ # Indicate who your project is intended for 'Intended Audience :: Science/Research', 'Intended Audience :: Financial and Insurance Industry', + 'Intended Audience :: Healthcare Industry', 'Operating System :: OS Independent', 'Topic :: Office/Business :: News/Diary', + 'Topic :: Office/Business :: Financial', 'Topic :: Office/Business :: Financial :: Investment', 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Scientific/Engineering :: Visualization', + 'Topic :: Text Processing :: General', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13' ], keywords='news, news aggregator, risk, compliance, nlp, alternative data, factiva, trading news, market movers', # Required version conditioned by typed lists and Pandas - python_requires='>=3.8.4', + python_requires='>=3.10.0', install_requires=[ - 'requests>=2.28.1', - 'pandas>=1.5.2', - 'fastavro>=1.7.0', - 'google-cloud-core>=2.3.2', - 'google-cloud-pubsub>=2.13.11' + 'requests>=2.30.0', + 'pandas>=2.2.0', + 'fastavro>=1.9.0', + 'google-cloud-core>=2.4.0', + 'google-cloud-pubsub>=2.26.0' ], extras_require={ - "MongoDB": ["pymongo"], - "Elasticsearch": ["elasticsearch"], - "BigQuery": ["google-cloud-bigquery"] + 'dev': [ + 'pytest', + 'sphinx', + 'furo', + 'sphinx-inline-tabs', + 'sphinx-copybutton' + ], + 'mongodb': ['pymongo'], + 'elasticsearch': ['elasticsearch'], + 'bigquery': ['google-cloud-bigquery'] }) diff --git a/src/factiva/analytics/__init__.py b/src/factiva/analytics/__init__.py index f10bf95..4d15c97 100755 --- a/src/factiva/analytics/__init__.py +++ b/src/factiva/analytics/__init__.py @@ -2,23 +2,27 @@ Define methods and properties for tools module. 
""" __all__ = [ - 'ArticleRetrieval', 'UIArticle', - 'UserKey', 'OAuthUser', + 'ArticleFetcher', 'UIArticle', + 'UserKey', 'OAuthUser', 'AccountInfo', 'FactivaTaxonomy', 'FactivaTaxonomyCategories', 'SnapshotExplain', 'SnapshotExplainQuery', 'SnapshotExplainJobResponse', 'SnapshotExplainSamplesResponse', 'SnapshotTimeSeries', 'SnapshotTimeSeriesQuery', 'SnapshotTimeSeriesJobReponse', 'SnapshotExtraction', 'SnapshotExtractionQuery', 'SnapshotExtractionJobReponse', - 'StreamingInstance', 'StreamingQuery', 'StreamingSubscription' + 'SnapshotExtractionList', 'SnapshotExtractionListItem', + 'StreamingInstance', 'StreamingQuery', 'StreamingSubscription', + 'StreamingInstanceList', 'StreamingInstanceListItem', + 'SnapshotFiles' ] from .__version__ import __version__ -from .article_retrieval import ArticleRetrieval, UIArticle -from .auth import UserKey, OAuthUser +from .article_fetcher import ArticleFetcher, UIArticle +from .auth import UserKey, OAuthUser, AccountInfo from .taxonomy import FactivaTaxonomy, FactivaTaxonomyCategories from .snapshots import SnapshotExplain, SnapshotExplainQuery, SnapshotExplainJobResponse, SnapshotExplainSamplesResponse from .snapshots import SnapshotTimeSeries, SnapshotTimeSeriesQuery, SnapshotTimeSeriesJobReponse -from .snapshots import SnapshotExtraction, SnapshotExtractionQuery, SnapshotExtractionJobReponse -from .streams import StreamingInstance, StreamingQuery, StreamingSubscription +from .snapshots import SnapshotExtraction, SnapshotExtractionQuery, SnapshotExtractionJobReponse, SnapshotExtractionList, SnapshotExtractionListItem +from .streams import StreamingInstance, StreamingQuery, StreamingSubscription, StreamingInstanceList, StreamingInstanceListItem +from .integration import SnapshotFiles # from .tools import JSONLFileHandler, BigQueryHandler, MongoDBHandler version = __version__ diff --git a/src/factiva/analytics/__version__.py b/src/factiva/analytics/__version__.py index 7414642..f0ede3d 100644 --- a/src/factiva/analytics/__version__.py +++ b/src/factiva/analytics/__version__.py @@ -1 +1 @@ -__version__ = '0.3.10' +__version__ = '0.4.1' diff --git a/src/factiva/analytics/article_fetcher/__init__.py b/src/factiva/analytics/article_fetcher/__init__.py new file mode 100644 index 0000000..502b5f3 --- /dev/null +++ b/src/factiva/analytics/article_fetcher/__init__.py @@ -0,0 +1,7 @@ +""" + Define methods and properties for the Article API Service +""" + +__all__ = ['ArticleFetcher', 'UIArticle'] + +from .article_fetcher import ArticleFetcher, UIArticle \ No newline at end of file diff --git a/src/factiva/analytics/article_retrieval/article_retrieval.py b/src/factiva/analytics/article_fetcher/article_fetcher.py similarity index 67% rename from src/factiva/analytics/article_retrieval/article_retrieval.py rename to src/factiva/analytics/article_fetcher/article_fetcher.py index 6901cc5..571a76b 100644 --- a/src/factiva/analytics/article_retrieval/article_retrieval.py +++ b/src/factiva/analytics/article_fetcher/article_fetcher.py @@ -1,15 +1,131 @@ """ - Classes to interact with the Article Retrieval endpoints. + Classes to interact with the Article API endpoints. """ +import json from ..common import const from ..common import req from ..common import articleparser as ap from ..auth import OAuthUser -class ArticleRetrieval(): +class UIArticle(): + """ + Class that represents a single article for visualization purposes. Methods + and attributes are tailored for front-end environments. 
+
+    Parameters
+    ----------
+    article_dict : dict
+        A python dict with the structure returned by the Dow Jones Article
+        API service.
+
+    Examples
+    --------
+    See ArticleFetcher class examples.
+
+    """
+
+    an = None
+    """Article unique identifier, also known as Accession Number"""
+    headline = None
+    """Article's headline, also known as title"""
+    source_code = None
+    """Article content creator's code. e.g. WSJO"""
+    source_name = None
+    """Article content creator's name. e.g. The Wall Street Journal Online"""
+    publication_date = None
+    """Article's publication date in ISO format as provided by the source.
+    e.g. '2022-12-03'"""
+    metadata = {}
+    """Article's metadata dict. Contains Dow Jones Intelligent Identifiers
+    among other codes."""
+    content = {}
+    """Article's content dict. Full text with annotations and other UI elements."""
+    included = []
+    """References to objects linked to a specific article"""
+    relationships = {}
+    """References to related objects"""
+
+
+    def __init__(self, article_dict:dict) -> None:
+        if not isinstance(article_dict, dict):
+            raise ValueError('Param article_dict is not a python dict')
+        if ((not 'data' in article_dict.keys()) or
+            (not 'attributes' in article_dict['data']) or
+            (not 'id' in article_dict['data']) or
+            (not 'meta' in article_dict['data'])):
+            raise ValueError('Unexpected dict structure')
+
+        self.an = article_dict['data']['id']
+        self.headline = ap.extract_headline(article_dict['data']['attributes']['headline'])
+        self.source_code = article_dict['data']['attributes']['sources'][0]['code']
+        self.source_name = article_dict['data']['attributes']['sources'][0]['name']
+        self.publication_date = article_dict['data']['attributes']['publication_date']
+        self.metadata = article_dict['data']['meta']
+        self.content = article_dict['data']['attributes']
+        if 'included' in article_dict.keys():
+            self.included = article_dict['included']
+        else:
+            self.included = []
+        if 'relationships' in article_dict['data'].keys():
+            self.relationships = article_dict['data']['relationships']
+        else:
+            self.relationships = {}
+
+
+    # TODO: Improve parsing based on the an WSJO000020250109el19002bd (Links and images)
+    @property
+    def txt(self) -> str:
+        disp_txt = f"\n{self.headline}"
+        disp_txt += f"\n\n{self.source_name}, {self.publication_date}, {self.metadata['metrics']['word_count']} words\n\n"
+        disp_txt += ap.extract_body(self.content['body'][0], 'txt')
+        disp_txt += f"{ap.extract_txt(self.content['copyright'])}"
+        disp_txt += f"\nDocument identifier: {self.an}\n"
+        return disp_txt
+
+
+    # TODO: Improve parsing based on the an WSJO000020250109el19002bd (links and images)
+    @property
+    def html(self) -> str:
+        if 'logo' in self.content['sources'][0]:
+            # NOTE: HTML tags below were reconstructed; the img tag assumes the
+            #       source's 'logo' entry resolves to an image URL.
+            disp_txt = f"<img src='{self.content['sources'][0]['logo']}'>"
+        else:
+            disp_txt = ''
+        disp_txt += f"\n<h1>{self.headline}</h1>"
+        disp_txt += f"\n<p>{self.source_name}, {self.publication_date}, {self.metadata['metrics']['word_count']} words</p>"
+        disp_txt += ap.extract_body(self.content['body'][0], 'html')
+        disp_txt += f"\n<p>{ap.extract_txt(self.content['copyright'])}</p>"
+        disp_txt += f"\n<p>Document identifier: {self.an}</p>"
+        return disp_txt
+
+
+    def __repr__(self):
+        """Create string representation for this Class."""
+        return self.__str__()
+
+
+    def __str__(self, detailed=True, prefix='  ├─', root_prefix=''):
+        """Create string representation for this Class."""
+        ret_val = f"{root_prefix}<'factiva.analytics.UIArticle'> - [{len(self.metadata.keys())}] keys\n"
+        ret_val += f"{prefix}content: <dict> - [{len(self.content.keys())}] keys\n"
+        ret_val += f"{prefix}included: <list> - [{len(self.included)}] items\n"
+        ret_val += f"{prefix[0:-2]}└─relationships: <dict> - [{len(self.relationships.keys())}] keys\n"
+        return ret_val
+
+
+
+
+
+class ArticleFetcher():
     """
-    Allows to fetch articles against the Article Retrieval Service using
+    Allows to fetch articles against the Article API Service using
     the provided OAuthUser credentials.
 
     Parameters
@@ -20,37 +136,37 @@ class ArticleRetrieval():
 
     Examples
    --------
-    Create an ArticleRetrieval instance.
+    Create an ArticleFetcher instance.
 
     .. code-block:: python
 
-        from factiva.analytics import ArticleRetrieval
-        ar = ArticleRetrieval()
+        from factiva.analytics import ArticleFetcher
+        ar = ArticleFetcher()
         ar
 
     .. code-block::
 
-        <class 'factiva.analytics.article_retrieval.article_retrieval.ArticleRetrieval'>
+        <'factiva.analytics.ArticleFetcher'>
         |-oauth_user: <'factiva.analytics.OAuthUser'>
         |  |-client_id = fbwqyORz0te484RQTt0E7qj6Tooj4Cs6
         |  |-token_status = OK
         |  |-...
 
     """
 
-    __API_RETRIEVAL_ENDPOINT_BASEURL = f'{const.API_HOST}{const.API_RETRIEVAL_ENDPOINT_BASEURL}/'
+    __API_ARTICLE_ENDPOINT_BASEURL = f"{const.API_HOST}{const.API_ARTICLE_ENDPOINT_BASEURL}/"
 
     oauth_user = None
     """
-    User instance wich provides the credentials to connect to the Article Retrieval API endpoints.
+    User instance which provides the credentials to connect to the Article API endpoint.
     """
 
     # TODO: Clarify how royalties work at user level. If one-click per UIUser is enough, it will
-    #       be possible to implement a 'cache' to avoid excess of retrievals when refreshing a
+    #       be possible to implement a 'cache' to avoid excess of fetches when refreshing a
    #       page.
     # TODO: When UIArticle is implemented, this will become a list of UIArticle
-    # retrieved_articles = {}
+    # fetched_articles = {}
     # """
-    # List that stores last retrieved articles.
+    # List that stores last fetched articles.
     # """
 
 
@@ -62,14 +178,12 @@ def __init__(self, oauth_user:OAuthUser=None) -> None:
         self.oauth_user = oauth_user
         if (not isinstance(self.oauth_user.current_jwt_token, str)) or (len(self.oauth_user.current_jwt_token.split('.')) != 3):
             raise ValueError('Unexpected token for the OAuthUser instance')
-        # self.retrieved_articles = {}
+        # self.fetched_articles = {}
 
 
-    def retrieve_single_article(self, an:str) -> dict:
+    def fetch_single_article(self, an:str) -> UIArticle:
         """
-        Method that retrieves a single article to be displayed in a user interface.
-        The requested item is initially retrieved from the . Additionally, the retrieved data is
-        stored in the class atttribute ``last_retrieval_response``.
+        Method that gets a single article's content to be displayed in a user interface.
 
         Parameters
         ----------
@@ -85,21 +199,21 @@
         Examples
         --------
-        Creating a new ``ArticleRetrieval`` instance which reads credentials values from
-        environment variables and retrieves a single article:
+        Creating a new ``ArticleFetcher`` instance which reads credential values from
+        environment variables and returns a single article object ``UIArticle``:
+
+        .. code-block:: python
 
-            from factiva.analytics import ArticleRetrieval
-            ar = ArticleRetrieval()
-            article = ar.retrieve_single_article(an='WSJO000020221229eict000jh')
-            article
+            from factiva.analytics import ArticleFetcher
+            ar = ArticleFetcher()
+            article = ar.fetch_single_article(an='WSJO000020221229eict000jh')
+            print(article.txt)
 
         output
 
         .. code-block::
 
-            <class 'factiva.analytics.article_retrieval.article_retrieval.UIArticle'>
+            <'factiva.analytics.UIArticle'>
             |-an: WSJO000020221229eict000jh
             |-headline: Europe Taps Tech's Power-Hungry Data Centers to Heat Homes.
             |-source_code: WSJO
             |-...
 
         """
         if (not isinstance(an, str) or (not len(an) == 25)):
             raise ValueError('AN parameter not valid. Length should be 25 characters.')
-        # if an in self.retrieved_articles.keys():
-        #     return self.retrieved_articles[an]
-        drn_ref = f'drn:archive.newsarticle.{an}'
+        # if an in self.fetched_articles.keys():
+        #     return self.fetched_articles[an]
+        drn_ref = f"drn:archive.newsarticle.{an}"
         req_headers = {
-            "Authorization": f'Bearer {self.oauth_user.current_jwt_token}'
+            "Authorization": f"Bearer {self.oauth_user.current_jwt_token}"
         }
         article_response = req.api_send_request(
             method="GET",
-            endpoint_url=f'{self.__API_RETRIEVAL_ENDPOINT_BASEURL}{drn_ref}',
+            endpoint_url=f"{self.__API_ARTICLE_ENDPOINT_BASEURL}{drn_ref}",
             headers=req_headers
         )
         if article_response.status_code == 200:
             article_obj = UIArticle(json.loads(article_response.text))
         else:
             err_details = json.loads(article_response.text)
             if 'errors' in err_details.keys():
                 err_msg = err_details['errors'][0]['title']
                 raise PermissionError(err_msg)
-        # self.retrieved_articles = [article_obj]
+        # self.fetched_articles = [article_obj]
         return article_obj
 
 
-    # TODO: Implement a metod to retrieve multiple articles based on a param containing a list of ANs
+    # TODO: Implement a method to fetch multiple articles based on a param containing a list of ANs
 
 
     def __repr__(self):
         """Create string representation for this Class."""
         return self.__str__()
 
 
     def __str__(self, detailed=True, prefix='  ├─', root_prefix=''):
         """Create string representation for this Class."""
         child_prefix = '  │' + prefix
         ret_val = f"{root_prefix}<'factiva.analytics.ArticleFetcher'>\n"
         ret_val += f"{prefix}oauth_user: {self.oauth_user.__str__(detailed=False, prefix=child_prefix)}\n"
         return ret_val
 
 
-class UIArticle():
-    """
-    Class that represents a single article for visualization purposes. Methods
-    and attributes are tailored for front-end environments.
-
-    Parameters
-    ----------
-    article_dict : dict
-        A python dict with the structure returned by the Dow Jones Article
-        Retrieval service.
-
-    """
-
-    an = None
-    headline = None
-    source_code = None
-    source_name = None
-    publication_date = None
-    metadata = {}
-    content = {}
-    included = []
-    relationships = {}
-
-
-    def __init__(self, article_dict:dict) -> None:
-        if not isinstance(article_dict, dict):
-            raise ValueError('Param article_dict is not a python dict')
-        if ((not 'data' in article_dict.keys()) or
-            (not 'attributes' in article_dict['data']) or
-            (not 'id' in article_dict['data']) or
-            (not 'meta' in article_dict['data'])):
-            raise ValueError('Unexpected dict structure')
-
-        self.an = article_dict['data']['id']
-        self.headline = ap.extract_headline(article_dict['data']['attributes']['headline'])
-        self.source_code = article_dict['data']['attributes']['sources'][0]['code']
-        self.source_name = article_dict['data']['attributes']['sources'][0]['name']
-        self.publication_date = article_dict['data']['attributes']['publication_date']
-        self.metadata = article_dict['data']['meta']
-        self.content = article_dict['data']['attributes']
-        if 'included' in article_dict.keys():
-            self.included = article_dict['included']
-        else:
-            self.included = []
-        if 'relationships' in article_dict['data'].keys():
-            self.relationships = article_dict['data']['relationships']
-        else:
-            self.relationships = {}
-
-
-    @property
-    def txt(self) -> str:
-        disp_txt = f"\n{self.headline}"
-        disp_txt += f"\n\n{self.source_name}, {self.publication_date}, {self.metadata['metrics']['word_count']} words\n\n"
-        disp_txt += ap.extract_body(self.content['body'][0], 'txt')
-        disp_txt += f"{ap.extract_txt(self.content['copyright'])}"
-        disp_txt += f"\nDocument identifier: {self.an}\n"
-        return disp_txt
-
-
-    @property
-    def html(self) -> str:
-        if 'logo' in self.content['sources'][0]:
-            disp_txt = f"<img src='{self.content['sources'][0]['logo']}'>\n"
-        disp_txt += f"\n<h1>{self.headline}</h1>"
-        disp_txt += f"\n<p>{self.source_name}, {self.publication_date}, {self.metadata['metrics']['word_count']} words</p>"
-        disp_txt += ap.extract_body(self.content['body'][0], 'html')
-        disp_txt += f"\n<p>{ap.extract_txt(self.content['copyright'])}</p>"
-        disp_txt += f"\n<p>Document identifier: {self.an}</p>"
-        return disp_txt
-
-
-    def __repr__(self):
-        """Create string representation for this Class."""
-        return self.__str__()
-
-
-    def __str__(self, detailed=True, prefix='  ├─', root_prefix=''):
-        """Create string representation for this Class."""
-        ret_val = f"{root_prefix}<'factiva.analytics.UIArticle'> - [{len(self.metadata.keys())}] keys\n"
-        ret_val += f'{prefix}content: <dict> - [{len(self.content.keys())}] keys\n'
-        ret_val += f'{prefix}included: <list> - [{len(self.included)}] items\n'
-        ret_val += f'{prefix[0:-2]}└─relationships: <dict> - [{len(self.relationships.keys())}] keys\n'
-        return ret_val
-
diff --git a/src/factiva/analytics/article_retrieval/__init__.py b/src/factiva/analytics/article_retrieval/__init__.py
deleted file mode 100644
index cbda86d..0000000
--- a/src/factiva/analytics/article_retrieval/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-"""
-    Define methods and properties for the Article Retrieval Service
-"""
-
-__all__ = ['ArticleRetrieval', 'UIArticle']
-
-from .article_retrieval import ArticleRetrieval, UIArticle
\ No newline at end of file
diff --git a/src/factiva/analytics/auth/__init__.py b/src/factiva/analytics/auth/__init__.py
index b8e496a..88f8ba8 100644
--- a/src/factiva/analytics/auth/__init__.py
+++ b/src/factiva/analytics/auth/__init__.py
@@ -4,7 +4,8 @@
 """
 
-__all__ = ['UserKey', 'OAuthUser']
+__all__ = ['UserKey', 'OAuthUser', 'AccountInfo']
 
 from .userkey import UserKey
 from .oauthuser import OAuthUser
+from .accountinfo import AccountInfo
diff --git a/src/factiva/analytics/auth/accountinfo.py b/src/factiva/analytics/auth/accountinfo.py
new file mode 100755
index 0000000..b61445c
--- /dev/null
+++ b/src/factiva/analytics/auth/accountinfo.py
@@ -0,0 +1,396 @@
+"""
+This module contains classes and tools to interact with account-related
+endpoints available in Factiva Analytics APIs.
+"""
+import json
+import pandas as pd
+from ..common import log, req, tools, const, config
+from ..auth import UserKey
+from ..snapshots import SnapshotExtractionList
+from ..streams import StreamingInstanceList
+
+class AccountInfo:
+    """
+    Class that represents a user-key Account and can be instantiated based on the
+    user-key value provided when the Factiva Analytics account was provisioned and
+    notified via the Welcome email.
+
+    Parameters
+    ----------
+    user_key : str
+        String containing the 32-character long API Key. If not provided, the
+        constructor will try to obtain its value from the ``FACTIVA_USERKEY``
+        environment variable.
+
+    Examples
+    --------
+    Creating a new instance taking the key value from the ``FACTIVA_USERKEY``
+    environment variable:
+
+    .. code-block:: python
+
+        from factiva.analytics import AccountInfo
+        ai = AccountInfo()
+        print(ai)
+
+    .. code-block::
+
+        <'factiva.analytics.AccountInfo'>
+        ├─user_key: <'factiva.analytics.UserKey'>
+        │  ├─key: ****************************nQdu
+        │  └─cloud_token: **********************YKB22sJCkHXX
+        ├─account_name: Account-Name
+        ├─account_type: account_with_contract_limits
+        ├─active_product: DNA
+        ├─max_allowed_extracted_documents: 20,000,000
+        ├─max_allowed_extractions: 30
+        ├─currently_running_extractions: 0
+        ├─total_extracted_documents: 15,315,291
+        ├─total_extractions: 22
+        ├─total_stream_instances: 0
+        ├─total_stream_subscriptions: 0
+        ├─extractions_list: 
+        ├─streams_list: 
+        ├─enabled_company_identifiers:
+        │  ├─[1]: sedol
+        │  ├─[3]: cusip
+        │  ├─[4]: isin
+        │  └─[5]: ticker_exchange
+        ├─remaining_documents: 4,684,709
+        └─remaining_extractions: 8
+
+    Creating a new AccountInfo instance providing the ``user_key`` string explicitly and
+    retrieving the latest account details:
+
+    .. code-block:: python
+
+        from factiva.analytics import AccountInfo
+        ai = AccountInfo('abcd1234abcd1234abcd1234abcd1234')
+        print(ai)
+
+    """
+
+    __API_ENDPOINT_BASEURL = f"{const.API_HOST}{const.API_ACCOUNT_BASEPATH}/"
+    __log = None
+
+    user_key: UserKey = None
+    account_name: str = None
+    account_type: str = None
+    active_product: str = None
+    # max_allowed_concurrent_extractions: int = None
+    max_allowed_extracted_documents: int = None
+    max_allowed_extractions: int = None
+    currently_running_extractions: int = None
+    # total_downloaded_bytes: int = None
+    total_extracted_documents: int = None
+    total_extractions: int = None
+    total_stream_instances: int = None
+    total_stream_subscriptions: int = None
+    enabled_company_identifiers: list = None
+    stream_jobs: StreamingInstanceList = None
+    extraction_jobs: SnapshotExtractionList = None
+    time_series_jobs: pd.DataFrame = None
+
+
+    def __init__(self, user_key: UserKey | str=None):
+        """
+        Construct the instance of the class
+
+        """
+        self.__log = log.get_factiva_logger()
+        self.user_key = UserKey(user_key)
+        self.get_stats()
+        self.get_extractions()
+        self.get_streams(running=False)
+        self.get_time_series()
+
+
+    @property
+    def remaining_extractions(self):
+        """
+        Dynamic property that calculates the account's remaining extractions
+        """
+        if self.max_allowed_extractions:
+            return self.max_allowed_extractions - self.total_extractions
+        return None
+
+
+    @property
+    def remaining_documents(self):
+        """
+        Dynamic property that calculates the account's remaining documents
+        """
+        if self.max_allowed_extracted_documents:
+            return self.max_allowed_extracted_documents - self.total_extracted_documents
+        return None
+
+
+    @log.factiva_logger()
+    def get_stats(self) -> bool:
+        """
+        Request the account details from the Factiva Account API Endpoint.
+        This operation can take several seconds to complete.
+
+        Returns
+        -------
+        bool:
+            ``True`` if the operation was completed successfully. ``False``
+            otherwise. All returned values are assigned to the object's
+            properties directly.
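+
+        Examples
+        --------
+        A minimal sketch; the constructor already pulls these values, and the
+        method can be called again at any time to refresh them (hypothetical key):
+
+        .. code-block:: python
+
+            from factiva.analytics import AccountInfo
+            ai = AccountInfo('abcd1234abcd1234abcd1234abcd1234')
+            ai.get_stats()
+            print(ai.account_name)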
+
+        """
+        self.__log.info('get_stats started')
+        account_endpoint = f"{self.__API_ENDPOINT_BASEURL}{self.user_key.key}"
+        req_head = {'user-key': self.user_key.key}
+        resp = req.api_send_request(method='GET', endpoint_url=account_endpoint, headers=req_head)
+        if resp.status_code == 200:
+            try:
+                resp_obj = json.loads(resp.text)
+                self.account_name = resp_obj['data']['attributes']['name']
+                self.account_type = resp_obj['data']['type']
+                self.active_product = resp_obj['data']['attributes']['products']
+                # self.max_allowed_concurrent_extractions = resp_obj['data']['attributes']['max_allowed_concurrent_extracts']
+                self.max_allowed_extracted_documents = resp_obj['data']['attributes']['max_allowed_document_extracts']
+                self.max_allowed_extractions = resp_obj['data']['attributes']['max_allowed_extracts']
+                self.currently_running_extractions = resp_obj['data']['attributes']['cnt_curr_ext']
+                # self.total_downloaded_bytes = resp_obj['data']['attributes']['current_downloaded_amount']
+                self.total_extracted_documents = resp_obj['data']['attributes']['tot_document_extracts']
+                self.total_extractions = resp_obj['data']['attributes']['tot_extracts']
+                self.total_stream_instances = resp_obj['data']['attributes']['tot_topics']
+                self.total_stream_subscriptions = resp_obj['data']['attributes']['tot_subscriptions']
+                self.enabled_company_identifiers = resp_obj['data']['attributes']['enabled_company_identifiers']
+            except Exception as error:
+                raise AttributeError('Unexpected Account Information API Response.') from error
+        elif resp.status_code == 403:
+            raise ValueError('Factiva User-Key does not exist or is inactive.')
+        else:
+            raise RuntimeError('Unexpected Account Information API Error')
+        self.__log.info('get_stats ended')
+        return True
+
+
+    @log.factiva_logger()
+    def get_extractions(self, updates=False) -> SnapshotExtractionList:
+        """
+        Request a list of historical extractions for the account.
+
+        Parameters
+        ----------
+        updates : bool
+            Indicates whether the retrieved list should include update
+            operations (``True``) or not (``False`` - default).
+
+        Returns
+        -------
+        SnapshotExtractionList:
+            List of historical extractions for the account.
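+
+        Examples
+        --------
+        A minimal sketch, assuming the ``FACTIVA_USERKEY`` environment variable is set:
+
+        .. code-block:: python
+
+            from factiva.analytics import AccountInfo
+            ai = AccountInfo()
+            extractions = ai.get_extractions()  # also refreshes ai.extraction_jobs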
+
+        """
+        self.__log.info('get_extractions started')
+        endpoint = f"{const.API_HOST}{const.API_EXTRACTIONS_BASEPATH}"
+
+        headers_dict = {'user-key': self.user_key.key}
+
+        response = req.api_send_request(method='GET', endpoint_url=endpoint, headers=headers_dict)
+
+        if response.status_code != 200:
+            if response.status_code == 403:
+                raise ValueError('Factiva API-Key does not exist or is inactive.')
+
+            raise RuntimeError(f"Unexpected API Error with message: {response.text}")
+
+        response_data = response.json()
+        if response_data['data'] == []:
+            extraction_df = pd.DataFrame()
+        else:
+            extraction_df = pd.DataFrame([tools.flatten_dict(extraction) for extraction in response_data['data']])
+            extraction_df.rename(columns={'id': 'object_id'}, inplace=True)
+            ids_df = extraction_df['object_id'].str.split('-', expand=True)
+
+            if ids_df.shape[1] >= 5:
+                extraction_df['short_id'] = ids_df[4]
+            else:
+                extraction_df['short_id'] = None
+
+            if ids_df.shape[1] >= 7:
+                extraction_df['update_id'] = ids_df[6]
+            else:
+                extraction_df['update_id'] = None
+
+            extraction_df.drop(['self', 'type'], axis=1, inplace=True)
+
+            if not updates:
+                extraction_df = extraction_df.loc[extraction_df.update_id.isnull()]
+
+        self.extraction_jobs = SnapshotExtractionList(extraction_df)
+        self.__log.info('get_extractions ended')
+        return self.extraction_jobs
+
+
+    @log.factiva_logger()
+    def get_streams(self, running=True) -> StreamingInstanceList:
+        """
+        Retrieves the list of streams for the account.
+
+        Parameters
+        ----------
+        running : bool
+            Indicates whether the retrieved list should be restricted
+            to only running streams (``True`` - default) or also include
+            historical ones (``False``).
+
+        Returns
+        -------
+        StreamingInstanceList:
+            List object containing the account's streaming instances.
+
+        """
+        self.__log.info('get_streams started')
+        request_headers = {'user-key': self.user_key.key}
+        response = req.api_send_request(
+            method="GET",
+            endpoint_url=f"{const.API_HOST}{const.API_STREAMS_BASEPATH}",
+            headers=request_headers
+        )
+        if response.status_code == 200:
+            try:
+                def extract_subscriptions(subscription):
+                    id_list = []
+                    for i in subscription:
+                        s_idp = i['id'].split('-')
+                        s_id = f"{s_idp[-3]}-{s_idp[-2]}-{s_idp[-1]}"
+                        id_list.append(s_id)
+                    return id_list
+
+                response_data = response.json()
+                stream_df = pd.DataFrame([tools.flatten_dict(extraction) for extraction in response_data['data']])
+                stream_df.rename(columns={'id': 'stream_id'}, inplace=True)
+                ids_df = stream_df['stream_id'].str.split('-', expand=True)
+                stream_df['short_id'] = ids_df[4]
+                stream_df['stream_type'] = ids_df[2]
+                stream_df['subscriptions'] = stream_df['data'].apply(extract_subscriptions)
+                stream_df['n_subscriptions'] = stream_df['subscriptions'].str.len()
+                stream_df.drop(['self', 'type', 'data'], axis=1, inplace=True)
+
+                if running:
+                    stream_df = stream_df.loc[stream_df.job_status == const.API_JOB_RUNNING_STATE]
+            except Exception as error:
+                raise AttributeError('Unexpected Get Streams API Response.') from error
+        elif response.status_code == 404:
+            stream_df = pd.DataFrame()
+        elif response.status_code == 403:
+            raise ValueError('Factiva API-Key does not exist or is disabled.')
+        else:
+            raise RuntimeError('Unexpected Get Streams API Error')
+
+        self.stream_jobs = StreamingInstanceList(stream_df)
+        self.__log.info('get_streams ended')
+        return self.stream_jobs
+
+
+    @log.factiva_logger()
+    def get_time_series(self) -> pd.DataFrame:  # TODO: Change return type
+        """
+        Retrieves the list of Analytics (time series) jobs for the account.
+
+        Returns
+        -------
+        pandas.DataFrame:
+            DataFrame with the account's Analytics (time series) jobs,
+            including their IDs, creation dates and current states.
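+
+        Examples
+        --------
+        A minimal usage sketch (assumes at least one Analytics job exists
+        for the account):
+
+        .. code-block:: python
+
+            from factiva.analytics import AccountInfo
+            ai = AccountInfo()
+            print(ai.get_time_series())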
+
+        """
+        self.__log.info('get_time_series started')
+        request_headers = {'user-key': self.user_key.key}
+        response = req.api_send_request(
+            method="GET",
+            endpoint_url=f"{const.API_HOST}{const.API_ANALYTICS_BASEPATH}",
+            headers=request_headers
+        )
+        if response.status_code == 200:
+            try:
+                response_data = response.json()['data']
+                if response_data == []:
+                    time_series_df = pd.DataFrame()
+                    self.__log.info('No time series jobs found.')
+                else:
+                    all_ts = []
+                    for ts_entry in response_data:
+                        new_ts = {
+                            'created_datetime': ts_entry['attributes']['date_created'],
+                            'job_id': ts_entry['id'],
+                            'job_state': ts_entry['attributes']['current_state']
+                        }
+                        all_ts.append(new_ts)
+
+                    time_series_df = pd.DataFrame(all_ts)
+                    time_series_df.sort_values(by='created_datetime', inplace=True, ascending=False)
+
+            except Exception as error:
+                raise AttributeError('Unexpected Get Time Series API Response.') from error
+        elif response.status_code == 404:
+            time_series_df = pd.DataFrame()
+        elif response.status_code == 403:
+            raise ValueError('Factiva API-Key does not exist or is disabled.')
+        else:
+            raise RuntimeError('Unexpected Get Time Series API Error')
+
+        self.__log.info('get_time_series ended')
+        self.time_series_jobs = time_series_df
+        return time_series_df
+
+
+    def is_active(self) -> bool:
+        """
+        Checks whether the user key is active by sending a lightweight
+        request to the taxonomies endpoint.
+        """
+        request_headers = {'user-key': self.user_key.key}
+        response = req.api_send_request(
+            method="GET",
+            endpoint_url=f"{const.API_HOST}{const.API_SNAPSHOTS_TAXONOMY_BASEPATH}",
+            headers=request_headers
+        )
+        return response.status_code == 200
+
+
+    def __repr__(self):
+        """Return a string representation of the object."""
+        return self.__str__()
+
+
+    def __str__(self, detailed=True, prefix=' ├─', root_prefix=''):
+        pprop = self.__dict__.copy()
+        del pprop['_AccountInfo__log']
+        del pprop['user_key']
+        del pprop['account_name']
+        del pprop['account_type']
+        del pprop['active_product']
+        del pprop['enabled_company_identifiers']
+
+        ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}"
+        ret_val += f"\n{prefix}user_key: {self.user_key.__str__(detailed=False, prefix=' │  ├─')}"
+        ret_val += f"\n{prefix}account_name: {tools.print_property(self.account_name, '')}"
+        ret_val += f"\n{prefix}account_type: {tools.print_property(self.account_type, '')}\n"
+        ret_val += "\n".join((f"{prefix}{item}: {tools.print_property(pprop[item], '')}" for item in pprop))
+        ret_val += f"\n{prefix}enabled_company_identifiers:"
+        if len(self.enabled_company_identifiers) >= 1:
+            ci_list = [f"\n{prefix.replace('├', '│')[0:-1]} ├─[{ci['id']}]: {ci['name']}" for ci in self.enabled_company_identifiers]
+            ci_list.sort()
+            ci_list[-1] = ci_list[-1].replace('├', '└')
+            for ci in ci_list:
+                ret_val += ci
+        else:
+            ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─"
+        ret_val += f"\n{prefix}remaining_documents: {tools.print_property(self.remaining_documents, '')}"
+        ret_val += f"\n{prefix[0:-2]}└─remaining_extractions: {tools.print_property(self.remaining_extractions, '')}"
+
+        return ret_val
+
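The OAuthUser changes below replace ``datetime.datetime.utcfromtimestamp``,
deprecated since Python 3.12, with the timezone-aware
``datetime.datetime.fromtimestamp``. A minimal sketch of the equivalence
(the epoch value is a made-up example):

.. code-block:: python

    import datetime

    exp = 1735689600  # example 'exp' claim from a decoded JWT payload
    # Deprecated since Python 3.12:
    #   datetime.datetime.utcfromtimestamp(exp).replace(tzinfo=datetime.timezone.utc)
    # Timezone-aware replacement used in this diff:
    aware = datetime.datetime.fromtimestamp(exp, datetime.timezone.utc)
    print(aware.isoformat())  # 2025-01-01T00:00:00+00:00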
diff --git a/src/factiva/analytics/auth/oauthuser.py b/src/factiva/analytics/auth/oauthuser.py
index 60fd57b..d734059 100755
--- a/src/factiva/analytics/auth/oauthuser.py
+++ b/src/factiva/analytics/auth/oauthuser.py
@@ -36,7 +36,7 @@ class OAuthUser:
         from factiva.analytics import OAuthUser
         o = OAuthUser()
         headers = {
-            'Authorization': f'Bearer {o.current_jwt_token}'
+            'Authorization': f"Bearer {o.current_jwt_token}"
         }

     Shows the relevant properties of an ``OAuthUser`` instance.
@@ -45,17 +45,17 @@

         from factiva.analytics import OAuthUser
         o = OAuthUser()
-        o
+        print(o)

     output

     .. code-block::

         <'factiva.analytics.OAuthUser'>
-        |-client_id = ****************************4Cs6
-        |-username = 9ZZZ000000-svcaccount@dowjones.com
-        |-password = ************gRk3
-        |-token_status = not_authenticated
+        ├─client_id: ****************************4Cs6
+        ├─username: 9zzz131500-svcaccount@dowjones.com
+        ├─password: ************KAHl
+        └─token_status: not_authenticated

     """

@@ -187,7 +187,8 @@ def get_id_token(self) -> bool:
             self._id_token = response_body['id_token']
             self._access_token = response_body['access_token']
             bearer_payload = eval(base64.b64decode(self._access_token.split('.')[1] + '==').decode('utf-8'))
-            self._id_expiration = datetime.datetime.utcfromtimestamp(int(bearer_payload['exp'])).replace(tzinfo=datetime.timezone.utc)
+            # self._id_expiration = datetime.datetime.utcfromtimestamp(int(bearer_payload['exp'])).replace(tzinfo=datetime.timezone.utc)
+            self._id_expiration = datetime.datetime.fromtimestamp(int(bearer_payload['exp']), datetime.timezone.utc)
             return True
         elif authn_response.status_code == 403:
             raise PermissionError('Invalid user credentials')
@@ -232,7 +233,8 @@ def get_jwt_token(self) -> bool:
         response_body = authz_response.json()
         self._jwt_token = response_body["access_token"]
         bearer_payload = eval(base64.b64decode(self._jwt_token.split('.')[1] + '==').decode('utf-8'))
-        self._jwt_expiration = datetime.datetime.utcfromtimestamp(int(bearer_payload['exp'])).replace(tzinfo=datetime.timezone.utc)
+        # self._jwt_expiration = datetime.datetime.utcfromtimestamp(int(bearer_payload['exp'])).replace(tzinfo=datetime.timezone.utc)
+        self._jwt_expiration = datetime.datetime.fromtimestamp(int(bearer_payload['exp']), datetime.timezone.utc)
         return True

@@ -244,13 +246,13 @@ def __repr__(self):
     def __str__(self, detailed=True, prefix=' ├─', root_prefix=''):
         masked_clientid = tools.mask_string(self._client_id)
         ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n"
-        ret_val += f'{prefix}client_id: {masked_clientid}\n'
-        ret_val += f'{prefix}username: {self._username}\n'
+        ret_val += f"{prefix}client_id: {masked_clientid}\n"
+        ret_val += f"{prefix}username: {self._username}\n"
         if detailed:
             masked_password = tools.mask_string(self._password)
-            ret_val += f'{prefix}password: {masked_password}\n'
-            ret_val += f'{prefix[0:-2]}└─token_status: {self.token_status}\n'
+            ret_val += f"{prefix}password: {masked_password}\n"
+            ret_val += f"{prefix[0:-2]}└─token_status: {self.token_status}\n"
         else:
-            ret_val += f'{prefix}token_status: {self.token_status}\n'
+            ret_val += f"{prefix}token_status: {self.token_status}\n"
         ret_val += f"{prefix[0:-2]}└─..."
         return ret_val
diff --git a/src/factiva/analytics/auth/userkey.py b/src/factiva/analytics/auth/userkey.py
index cd65385..ec4164b 100755
--- a/src/factiva/analytics/auth/userkey.py
+++ b/src/factiva/analytics/auth/userkey.py
@@ -1,10 +1,10 @@
 """
-This module contains classes and tools to instantiate and support the lifecycle
-of UserKey objects. UserKey is the most used authentication method within
+This module contains classes and tools to manage the lifecycle of UserKey objects.
+UserKey is the most used authentication method within Factiva Analytics APIs. """ import json -import pandas as pd +# import pandas as pd from ..common import log, req, tools, const, config @@ -15,30 +15,15 @@ class UserKey: """ - __API_ENDPOINT_BASEURL = f'{const.API_HOST}{const.API_ACCOUNT_BASEPATH}/' - __API_CLOUD_TOKEN_URL = f'{const.API_HOST}{const.ALPHA_BASEPATH}{const.API_ACCOUNT_STREAM_CREDENTIALS_BASEPATH}' + # __API_ENDPOINT_BASEURL = "{const.API_HOST}{const.API_ACCOUNT_BASEPATH}/"" + __API_CLOUD_TOKEN_URL = f"{const.API_HOST}{const.API_ACCOUNT_STREAM_CREDENTIALS_BASEPATH}" __log = None key: str = None cloud_token: dict = None - account_name: str = None - account_type: str = None - active_product: str = None - max_allowed_concurrent_extractions: int = None - max_allowed_extracted_documents: int = None - max_allowed_extractions: int = None - currently_running_extractions: int = None - total_downloaded_bytes: int = None - total_extracted_documents: int = None - total_extractions: int = None - total_stream_instances: int = None - total_stream_subscriptions: int = None - enabled_company_identifiers: list = None - streams: pd.DataFrame = None - snapshots: pd.DataFrame = None - def __init__(self, key=None, stats=False): + def __init__(self, key=None): """ Construct the instance of the class @@ -48,79 +33,6 @@ def __init__(self, key=None, stats=False): String containing the 32-character long APi Key. If not provided, the constructor will try to obtain its value from the ``FACTIVA_USERKEY`` environment variable. - stats : bool - Indicates if user data has to be pulled from the server at creation - time (``True``) or just create an instance with no stats data - (``False`` - default). This operation fills account detail properties - along with maximum, used and remaining values. It may take several - seconds to complete. - - Examples - -------- - Creating a new UserKey instance providing the ``key`` string explicitly and - retrieving the latest account details: - - .. code-block:: python - - from factiva.analytics import UserKey - u = UserKey('abcd1234abcd1234abcd1234abcd1234', True) - print(u) - - .. code-block:: - - - ├─key: ****************************1234 - ├─cloud_token: - ├─account_name: AccName1234 - ├─account_type: account_with_contract_limits - ├─active_product: DNA - ├─max_allowed_concurrent_extractions: 5 - ├─max_allowed_extracted_documents: 200,000 - ├─max_allowed_extractions: 3 - ├─currently_running_extractions: 0 - ├─total_downloaded_bytes: 7,253,890 - ├─total_extracted_documents: 2,515 - ├─total_extractions: 1 - ├─total_stream_instances: 4 - ├─total_stream_subscriptions: 1 - ├─enabled_company_identifiers: - | ├─[1]: sedol - | ├─[3]: cusip - | ├─[4]: isin - | ├─[5]: ticker_exchange - ├─remaining_documents: 197,485 - └─remaining_extractions: 2 - - Creating a new instance taking the key value from the ``FACTIVA_USERKEY`` - environment varaible, and not requesting account statistics. - - .. code-block:: python - - from factiva.analytics import UserKey - u = UserKey() - print(u) - - .. 
code-block:: - - - ├─key: ****************************1234 - ├─cloud_token: - ├─account_name: - ├─account_type: - ├─active_product: - ├─max_allowed_concurrent_extractions: 0 - ├─max_allowed_extracted_documents: 0 - ├─max_allowed_extractions: 0 - ├─currently_running_extractions: 0 - ├─total_downloaded_bytes: 0 - ├─total_extracted_documents: 0 - ├─total_extractions: 0 - ├─total_stream_instances: 0 - ├─total_stream_subscriptions: 0 - ├─enabled_company_identifiers: - │ └─ - ├─remaining_documents: 0 - └─remaining_extractions: 0 """ self.__log = log.get_factiva_logger() @@ -136,166 +48,10 @@ def __init__(self, key=None, stats=False): raise ValueError('Factiva User-Key has the wrong length') self.key = key - self.cloud_token = {} - - if stats is True: - self.get_stats() + if self.is_active(): + self.get_cloud_token() else: - self.account_name = None - self.account_type = None - self.active_product = None - self.max_allowed_concurrent_extractions = None - self.max_allowed_extracted_documents = None - self.max_allowed_extractions = None - self.currently_running_extractions = None - self.total_downloaded_bytes = None - self.total_extracted_documents = None - self.total_extractions = None - self.total_stream_instances = None - self.total_stream_subscriptions = None - self.enabled_company_identifiers = [] - self.streams = [] - self.snapshots = [] - - - @property - def remaining_extractions(self): - """ - Dynamic property that calculates the account's remaining extractions - """ - if self.max_allowed_extractions: - return self.max_allowed_extractions - self.total_extractions - return None - - - @property - def remaining_documents(self): - """ - Dynamic property that calculates the account's remaining documents - """ - if self.max_allowed_extracted_documents: - return self.max_allowed_extracted_documents - self.total_extracted_documents - return None - - # @property - # def extractions_done(self): - # """Number of executed extractions""" - # return self.get_extractions() - - # @property - # def streams_running(self): - # """Number of currently running Streaming Instances""" - # return self.get_streams() - - - @log.factiva_logger() - def get_stats(self) -> bool: - """ - Request the account details to the Factiva Account API Endpoint. - This operation can take several seconds to complete. - - Returns - ------- - bool: - ``True`` if the operation was completed successfully. ``False`` - otherwise. All returned values are assigned to the object's - properties directly. - - Examples - -------- - Creates a local ``UserKey`` instance and then retrieves the stats. - - .. code-block:: python - - from factiva.analytics import UserKey - u = UserKey('abcd1234abcd1234abcd1234abcd1234') - print(u) - - output - - .. code-block:: - - - ├─key: ****************************1234 - ├─cloud_token: - ├─account_name: - ├─account_type: - ├─active_product: - ├─max_allowed_concurrent_extractions: 0 - ├─max_allowed_extracted_documents: 0 - ├─max_allowed_extractions: 0 - ├─currently_running_extractions: 0 - ├─total_downloaded_bytes: 0 - ├─total_extracted_documents: 0 - ├─total_extractions: 0 - ├─total_stream_instances: 0 - ├─total_stream_subscriptions: 0 - ├─enabled_company_identifiers: - │ └─ - ├─remaining_documents: 0 - └─remaining_extractions: 0 - - .. code-block:: python - - u.get_stats() - print(u) - - output - - .. 
code-block:: - - - ├─key: ****************************1234 - ├─cloud_token: - ├─account_name: AccName1234 - ├─account_type: account_with_contract_limits - ├─active_product: DNA - ├─max_allowed_concurrent_extractions: 5 - ├─max_allowed_extracted_documents: 200,000 - ├─max_allowed_extractions: 3 - ├─currently_running_extractions: 0 - ├─total_downloaded_bytes: 7,253,890 - ├─total_extracted_documents: 2,515 - ├─total_extractions: 1 - ├─total_stream_instances: 4 - ├─total_stream_subscriptions: 1 - ├─enabled_company_identifiers: - | ├─[1]: sedol - | ├─[3]: cusip - | ├─[4]: isin - | ├─[5]: ticker_exchange - ├─remaining_documents: 197,485 - └─remaining_extractions: 2 - - """ - self.__log.info('get_stats started') - account_endpoint = f'{self.__API_ENDPOINT_BASEURL}{self.key}' - req_head = {'user-key': self.key} - resp = req.api_send_request(method='GET', endpoint_url=account_endpoint, headers=req_head) - if resp.status_code == 200: - try: - resp_obj = json.loads(resp.text) - self.account_name = resp_obj['data']['attributes']['name'] - self.account_type = resp_obj['data']['type'] - self.active_product = resp_obj['data']['attributes']['products'] - self.max_allowed_concurrent_extractions = resp_obj['data']['attributes']['max_allowed_concurrent_extracts'] - self.max_allowed_extracted_documents = resp_obj['data']['attributes']['max_allowed_document_extracts'] - self.max_allowed_extractions = resp_obj['data']['attributes']['max_allowed_extracts'] - self.currently_running_extractions = resp_obj['data']['attributes']['cnt_curr_ext'] - self.total_downloaded_bytes = resp_obj['data']['attributes']['current_downloaded_amount'] - self.total_extracted_documents = resp_obj['data']['attributes']['tot_document_extracts'] - self.total_extractions = resp_obj['data']['attributes']['tot_extracts'] - self.total_stream_instances = resp_obj['data']['attributes']['tot_topics'] - self.total_stream_subscriptions = resp_obj['data']['attributes']['tot_subscriptions'] - self.enabled_company_identifiers = resp_obj['data']['attributes']['enabled_company_identifiers'] - except Exception as error: - raise AttributeError('Unexpected Account Information API Response.') from error - elif resp.status_code == 403: raise ValueError('Factiva User-Key does not exist or inactive.') - else: - raise RuntimeError('Unexpected Account Information API Error') - self.__log.info('get_stats ended') - return True @log.factiva_logger() @@ -315,7 +71,7 @@ def get_cloud_token(self) -> bool: req_head = {'user-key': self.key} response = req.api_send_request( method="GET", - endpoint_url=f'{self.__API_CLOUD_TOKEN_URL}', + endpoint_url=f"{self.__API_CLOUD_TOKEN_URL}", headers=req_head ) @@ -330,214 +86,17 @@ def get_cloud_token(self) -> bool: except TypeError as type_error: raise ValueError('Unable to get a cloud token for the given key. This account might have limited access.') from type_error - self.cloud_token = json.loads(streaming_credentials_string) + if streaming_credentials_string is not None: + self.cloud_token = json.loads(streaming_credentials_string) self.__log.info('get_cloud_token ended') return True - @log.factiva_logger() - def get_extractions(self, updates=False) -> pd.DataFrame: - """ - Request a list of historical extractions for the account. - - Parameters - ---------- - updates : bool - Indicates whether the retrieved list should include update - operations (``True``) or not (``False`` - default). - - Returns - ------- - padas.Dataframe: - containing the list of historical extractions for the account. 
- - """ - self.__log.info('get_extractions started') - endpoint = f'{const.API_HOST}{const.API_EXTRACTIONS_BASEPATH}' - - headers_dict = {'user-key': self.key} - - response = req.api_send_request(method='GET', endpoint_url=endpoint, headers=headers_dict) - - if response.status_code != 200: - if response.status_code == 403: - raise ValueError('Factiva API-Key does not exist or inactive.') - - raise RuntimeError(f'Unexpected API Error with message: {response.text}') - - response_data = response.json() - - extraction_df = pd.DataFrame([tools.flatten_dict(extraction) for extraction in response_data['data']]) - extraction_df.rename(columns={'id': 'object_id'}, inplace=True) - ids_df = extraction_df['object_id'].str.split('-', expand=True) - - if ids_df.shape[1] >= 5: - extraction_df['snapshot_sid'] = ids_df[4] - else: - extraction_df['snapshot_sid'] = None - - if ids_df.shape[1] >= 7: - extraction_df['update_id'] = ids_df[6] - else: - extraction_df['update_id'] = None - - extraction_df.drop(['self', 'type'], axis=1, inplace=True) - - if not updates: - extraction_df = extraction_df.loc[extraction_df.update_id.isnull()] - - self.__log.info('get_extractions ended') - return extraction_df - - - def show_extractions(self, updates=False): - """ - Shows the list of historical extractions for the account. Intended - to be used in notebooks or manual Python command executions. - - Parameters - ---------- - updates : bool - Indicates whether the retrieved list should include update - operations (``True``) or not (``False`` - default). - - Returns - ------- - nothing: - Displays a table with the extraction list. - - Examples - -------- - Show the historical extractions for the current user: - - .. code-block:: python - - from factiva.analytics import UserKey - u = UserKey() - u.show_extractions() - - .. code-block:: - - current_state format extraction_type snapshot_sid update_id - 0 JOB_STATE_DONE avro documents 0pjfkz33ra None - 1 JOB_STATE_DONE json documents 0rsfemt846 None - 2 JOB_STATE_DONE json documents 1snv7pjx1a None - 3 JOB_STATE_DONE json documents 2toxzrekx1 None - 4 JOB_STATE_DONE csv documents 2udvglt9xy None - .. ... ... ... ... ... - 12 JOB_STATE_DONE avro documents re9xq88syg None - 13 JOB_STATE_DONE json documents wfbf3eacz8 None - 14 JOB_STATE_DONE json documents ymhsvx20tl None - 15 JOB_STATE_DONE json documents yonrtw2hbe None - 16 JOB_STATE_DONE avro documents zpxgqyrqgr None - - """ - extractions = self.get_extractions(updates=updates) - print(extractions.loc[:, extractions.columns != 'object_id']) - - - @log.factiva_logger() - def get_streams(self, running=True) -> pd.DataFrame: - """ - Retrieves the list of streams for the user. - - Parameters - ---------- - running : bool - Indicates whether the retrieved list should be restricted - to only running streams (``True`` - default) or also include - historical ones (``False``). 
- - Returns - ------- - pandas.DataFrame: - DataFrame with the list of historical extractions - - """ - self.__log.info('get_streams started') - request_headers = {'user-key': self.key} - response = req.api_send_request( - method="GET", - endpoint_url=f'{const.API_HOST}{const.API_STREAMS_BASEPATH}', - headers=request_headers - ) - if response.status_code == 200: - try: - def extract_subscriptions(subscription): - id_list = [] - for i in subscription: - s_idp = i['id'].split('-') - s_id = f"{s_idp[-3]}-{s_idp[-2]}-{s_idp[-1]}" - id_list.append(s_id) - return id_list - - response_data = response.json() - stream_df = pd.DataFrame([tools.flatten_dict(extraction) for extraction in response_data['data']]) - stream_df.rename(columns={'id': 'object_id'}, inplace=True) - ids_df = stream_df['object_id'].str.split('-', expand=True) - stream_df['stream_id'] = ids_df[4] - stream_df['stream_type'] = ids_df[2] - stream_df['subscriptions'] = stream_df['data'].apply(extract_subscriptions) - stream_df['n_subscriptions'] = stream_df['subscriptions'].str.len() - stream_df.drop(['self', 'type', 'data'], axis=1, inplace=True) - - if running: - stream_df = stream_df.loc[stream_df.job_status == const.API_JOB_RUNNING_STATE] - - self.__log.info('get_streams ended') - return stream_df - except Exception as error: - raise AttributeError('Unexpected Get Streams API Response.') from error - elif response.status_code == 403: - raise ValueError('Factiva API-Key does not exist or inactive.') - else: - raise RuntimeError('Unexpected Get Streams API Error') - - - def show_streams(self, running=True): - """ - Shows the list of streams for a given user. - - This function runs the existing function get_streams and - prints a user-friendly table with stream details. - - Parameters - ---------- - running : bool - Flag that indicates whether the displayed list should be restricted - to only running streams (True) or also include cancelled and failed - ones (False). - - Returns - ------- - nothing: - Displays a table with the extraction list. - - Examples - -------- - Show running streams: - - .. code-block:: python - - from factiva.analytics import UserKey - u = UserKey() - u.show_streams() - - .. code-block:: - - job_status stream_id stream_type subscriptions n_subscriptions - 1 JOB_STATE_RUNNING kmzx8wrbzs stream [kmzx8wrbzs-filtered-1nJvA5] 1 - - """ - account_streams = self.get_streams(running=running) - print(account_streams.loc[:, account_streams.columns != 'object_id']) - - def is_active(self) -> bool: request_headers = {'user-key': self.key} response = req.api_send_request( method="GET", - endpoint_url=f'{const.API_HOST}{const.API_SNAPSHOTS_TAXONOMY_BASEPATH}', + endpoint_url=f"{const.API_HOST}{const.API_SNAPSHOTS_TAXONOMY_BASEPATH}", headers=request_headers ) if response.status_code == 200: @@ -546,88 +105,22 @@ def is_active(self) -> bool: return False - @staticmethod - def _create_user_key(key=None, stats=False): - """ - Private method. - Determine the way to initialize an api key user according to the type - of parameter provided. 
- - Parameters - ---------- - api_user : None, str, UserKey - Source to create a UserKey instance - stats : boolean, optional (Default: False) - Indicates if user data has to be pulled from the server - - Returns - ------- - UserKey: - - When None is passed, UserKey instance using credentials from ENV variables - - When str is passed, UserKey instance using the provided parameter as credentials - - When UserKey is passed, it returns the same instance - - """ - if isinstance(key, UserKey): - return key - - if isinstance(key, str): - try: - return UserKey(key, stats=stats) - except Exception as error: - raise RuntimeError("User cannot be obtained from the provided key.") from error - - if key is None: - try: - return UserKey(stats=stats) - except Exception as error: - raise RuntimeError("User cannot be obtained from ENV variables") from error - - raise RuntimeError("Unexpected api_user value") - - def __repr__(self): """Return a string representation of the object.""" return self.__str__() def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): - pprop = self.__dict__.copy() - del pprop['_UserKey__log'] - del pprop['key'] - del pprop['cloud_token'] - del pprop['account_name'] - del pprop['account_type'] - del pprop['active_product'] - del pprop['enabled_company_identifiers'] masked_key = tools.mask_string(self.key) - if self.cloud_token == {}: + if not self.cloud_token: masked_token = '' else: masked_token = tools.mask_string(self.cloud_token['private_key'][58:92], 12) ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}" - ret_val += f'\n{prefix}key: {masked_key}' + ret_val += f"\n{prefix}key: {masked_key}" + ret_val += f"\n{prefix[0:-2]}└─cloud_token: {masked_token}" - if detailed: - ret_val += f'\n{prefix}cloud_token: {masked_token}' - ret_val += f"\n{prefix}account_name: {tools.print_property(self.account_name, '')}" - ret_val += f"\n{prefix}account_type: {tools.print_property(self.account_type, '')}\n" - ret_val += "\n".join((f"{prefix}{item}: {tools.print_property(pprop[item], '')}" for item in pprop)) - ret_val += f"\n{prefix}enabled_company_identifiers:" - if len(self.enabled_company_identifiers) >= 1: - ci_list = [f"\n{prefix.replace('├', '│')[0:-1]} ├─[{ci['id']}]: {ci['name']}" for ci in self.enabled_company_identifiers] - ci_list.sort() - ci_list[-1] = ci_list[-1].replace('├', '└') - for ci in ci_list: - ret_val += ci - else: - ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─" - ret_val += f"\n{prefix}remaining_documents: {tools.print_property(self.remaining_documents, '')}" - ret_val += f"\n{prefix[0:-2]}└─remaining_extractions: {tools.print_property(self.remaining_extractions, '')}" - else: - ret_val += f'{prefix[0:-2]}└─...' return ret_val diff --git a/src/factiva/analytics/common/articleparser.py b/src/factiva/analytics/common/articleparser.py index 4cba140..06363ba 100644 --- a/src/factiva/analytics/common/articleparser.py +++ b/src/factiva/analytics/common/articleparser.py @@ -1,6 +1,8 @@ -"""Functions to parse ArticleRetrieval JSON format""" +"""Functions to parse ArticleFetcher JSON format""" -def extract_txt(txt_dict:dict or list) -> str: +from typing import Union + +def extract_txt(txt_dict: Union[dict, list]) -> str: hl_part = '' if isinstance(txt_dict, dict): @@ -46,7 +48,7 @@ def extract_body(body_dict:dict, format='txt') -> str: for p_item in p_list: if format == 'html': - content += "\n
<P>"
+            content += "\n<p>"
         content += extract_txt(p_item)
         if format == 'html':
             content += '</p>
\n' diff --git a/src/factiva/analytics/common/config.py b/src/factiva/analytics/common/config.py index d3fc75a..6fac4b7 100644 --- a/src/factiva/analytics/common/config.py +++ b/src/factiva/analytics/common/config.py @@ -13,6 +13,7 @@ def load_environment_value(config_key, default=None) -> str: # Logging Level FACTIVA_LOGLEVEL = load_environment_value('FACTIVA_LOGLEVEL', 'INFO').upper() +USERAGENT = load_environment_value('USERAGENT', 'False').upper() == 'FALSE' # Default file locations DOWNLOAD_DEFAULT_FOLDER = load_environment_value( diff --git a/src/factiva/analytics/common/const.py b/src/factiva/analytics/common/const.py index 8945acd..ce7c220 100644 --- a/src/factiva/analytics/common/const.py +++ b/src/factiva/analytics/common/const.py @@ -6,28 +6,29 @@ API_HOST = 'https://api.dowjones.com' API_ACCOUNT_OAUTH2_URL = 'https://accounts.dowjones.com/oauth2/v1/token' -API_LATEST_VERSION = "2.0" +API_LATEST_VERSION = "3.0" # UserKey -API_ACCOUNT_BASEPATH = '/alpha/accounts' -API_ACCOUNT_STREAM_CREDENTIALS_BASEPATH = '/accounts/streaming-credentials' +API_ACCOUNT_BASEPATH = '/sns-accounts' +API_ACCOUNT_STREAM_CREDENTIALS_BASEPATH = '/sns-accounts/streaming-credentials' # Dynamic Prefixes -ALPHA_BASEPATH = '/alpha' +# ALPHA_BASEPATH = '' DNA_BASEPATH = '/dna' # Deprecated # Snapshots -API_SNAPSHOTS_BASEPATH = '/alpha/extractions/documents' +API_SNAPSHOTS_BASEPATH = '/extractions/documents' API_EXPLAIN_SUFFIX = '/_explain' -API_ANALYTICS_BASEPATH = '/alpha/analytics' -API_EXTRACTIONS_BASEPATH = '/alpha/extractions' +API_ANALYTICS_BASEPATH = '/analytics' +API_EXTRACTIONS_BASEPATH = '/extractions' API_EXTRACTIONS_SAMPLES_SUFFIX = '/samples' API_DEFAULT_EXTRACTION_TYPE = "documents" API_SAMPLES_EXTRACTION_TYPE = "samples" +API_MAX_SAMPLES = 10 -API_SNAPSHOTS_TAXONOMY_BASEPATH = '/alpha/taxonomies' -API_SNAPSHOTS_COMPANIES_BASEPATH = '/alpha/companies' -API_SNAPSHOTS_COMPANY_IDENTIFIERS_BASEPATH = '/alpha/companies/identifiers' +API_SNAPSHOTS_TAXONOMY_BASEPATH = '/taxonomies' +API_SNAPSHOTS_COMPANIES_BASEPATH = '/companies' +API_SNAPSHOTS_COMPANY_IDENTIFIERS_BASEPATH = '/companies/identifiers' API_SNAPSHOTS_COMPANIES_PIT = '/pit' CUSIP_COMPANY_IDENTIFIER = 'cusip' ISIN_COMPANY_IDENTIFIER = 'isin' @@ -67,17 +68,17 @@ 'company_codes_about_isin', 'company_codes_about_sedol', 'company_codes_about_ticker', 'company_codes_relevance_cusip', 'company_codes_relevance_isin', 'company_codes_relevance_sedol', - 'company_codes_relevance_ticker' + 'company_codes_relevance_ticker', 'language_code' ] # Streams -API_STREAMS_BASEPATH = '/alpha/streams' +API_STREAMS_BASEPATH = '/streams' DOC_COUNT_EXCEEDED = "DOC_COUNT_EXCEEDED" CHECK_EXCEEDED_WAIT_SPACING = 300 PUBSUB_MESSAGES_WAIT_SPACING = 10 -# Article Retrieval -API_RETRIEVAL_ENDPOINT_BASEURL = '/content/refs' +# Article API (Article Fetch) +API_ARTICLE_ENDPOINT_BASEURL = '/content/refs' # API STATES API_JOB_CREATED_STATE = 'JOB_CREATED' @@ -96,7 +97,7 @@ API_JOB_DONE_STATE, API_JOB_FAILED_STATE, API_JOB_CANCELLED_STATE ] -API_JOB_ACTIVE_WAIT_SPACING = 10 +API_JOB_ACTIVE_WAIT_SPACING = 15 # SNAPSHOT FILES SNAPSHOT_FILE_STATS_FIELDS = [ @@ -176,3 +177,5 @@ ACTION_CONSOLE_INDICATOR[REP_ACTION] = ':' ACTION_CONSOLE_INDICATOR[DEL_ACTION] = '&' ACTION_CONSOLE_INDICATOR[ERR_ACTION] = '!' 
+ +TEST_REQUEST_SPACING_SECONDS = 3 \ No newline at end of file diff --git a/src/factiva/analytics/common/log.py b/src/factiva/analytics/common/log.py index 78a2377..e5dac3b 100644 --- a/src/factiva/analytics/common/log.py +++ b/src/factiva/analytics/common/log.py @@ -28,7 +28,7 @@ def get_factiva_logger(): Path(LOGS_DEFAULT_FOLDER).mkdir(parents=True, exist_ok=True) logger = logging.Logger(__name__) logger.setLevel(FACTIVA_LOGLEVEL) - file_name = f'factiva-analytics-{datetime.datetime.now().strftime("%Y-%m-%d")}' + file_name = f"factiva-analytics-{datetime.datetime.now().strftime('%Y-%m-%d')}" handler = logging.FileHandler( f"{LOGS_DEFAULT_FOLDER}/{file_name}.log", 'a+') handler.setFormatter( diff --git a/src/factiva/analytics/common/req.py b/src/factiva/analytics/common/req.py index bb77ace..ebe7a44 100644 --- a/src/factiva/analytics/common/req.py +++ b/src/factiva/analytics/common/req.py @@ -7,6 +7,7 @@ import requests from . import tools from . import const +from . import config from ...analytics import __version__ from .log import factiva_logger, get_factiva_logger @@ -25,7 +26,7 @@ def _send_get_request(endpoint_url:str=const.API_HOST, params=qs_params, stream=stream) if get_response.status_code >= 400: - __log.error(f'GET Request Error [{get_response.status_code}]: {get_response.text}') + __log.error(f"GET Request Error [{get_response.status_code}]: {get_response.text}") return get_response @factiva_logger @@ -41,18 +42,18 @@ def _send_post_request(endpoint_url:str=const.API_HOST, else: raise ValueError('Unexpected payload value') - __log.debug(f'POST request with payload - Start') + __log.debug(f"POST request with payload - Start") post_response = requests.post(endpoint_url, headers=headers, data=payload_str) if post_response.status_code >= 400: - __log.error(f'POST Request Error [{post_response.status_code}]: {post_response.text}') - __log.debug(f'POST request with Payload - End') + __log.error(f"POST Request Error [{post_response.status_code}]: {post_response.text}") + __log.debug(f"POST request with Payload - End") return post_response - __log.debug(f'POST request NO payload - Start') + __log.debug(f"POST request NO payload - Start") post_response = requests.post(endpoint_url, headers=headers) if post_response.status_code >= 400: - __log.error(f'POST Request Error [{post_response.status_code}]: {post_response.text}') - __log.debug(f'POST request NO Payload - End') + __log.error(f"POST Request Error [{post_response.status_code}]: {post_response.text}") + __log.debug(f"POST request NO Payload - End") return post_response @factiva_logger @@ -69,15 +70,20 @@ def api_send_request(method:str='GET', if not isinstance(headers, dict): raise ValueError('Unexpected headers value') + if 'X-API-VERSION' not in headers: + headers.update({ + 'X-API-VERSION': const.API_LATEST_VERSION + }) + vsum = 'f4c71v4f4c71v4f4c71v4f4c71v4f4c7' if 'user-key' in headers: vsum = tools.md5hash(headers['user-key']) - headers.update({ - 'User-Agent': f'RDL-Python-{__version__}-{vsum}' - }) - - __log.debug(f"{method} Request with User-Agent {headers['User-Agent']}") + if config.USERAGENT: + headers.update({ + 'User-Agent': f"RDL-Python-{__version__}-{vsum}" + }) + __log.debug(f"{method} Request with User-Agent {headers['User-Agent']}") try: if method == 'GET': @@ -140,22 +146,23 @@ def download_file(file_url:str, tools.create_path_if_not_exist(to_save_path) if add_timestamp: - file_name = f'{file_name}-{datetime.now()}' + file_name = f"{file_name}-{datetime.now()}" vsum = 'f4c71v4f4c71v4f4c71v4f4c71v4f4c7' if 
'user-key' in headers: vsum = tools.md5hash(headers['user-key']) - headers.update({ - 'User-Agent': f'RDL-Python-{__version__}-{vsum}' - }) + if config.USERAGENT: + headers.update({ + 'User-Agent': f"RDL-Python-{__version__}-{vsum}" + }) response = _send_get_request(endpoint_url=file_url, headers=headers, stream=True) local_file_name = os.path.join(to_save_path, - f'{file_name}.{file_extension}') + f"{file_name}.{file_extension}") with open(local_file_name, 'wb') as f: f.write(response.content) diff --git a/src/factiva/analytics/common/snapshots_to_streams.txt b/src/factiva/analytics/common/snapshots_to_streams.txt new file mode 100644 index 0000000..fa1d504 --- /dev/null +++ b/src/factiva/analytics/common/snapshots_to_streams.txt @@ -0,0 +1,21 @@ +import json + +queries = json.load(open('dna_queries_october_2020.json', 'rt', encoding='utf-8')) + +with_streams = [] + +for query in queries: + snapshot_query = query['snapshots_query'] + if 'REGEXP_CONTAINS' in snapshot_query: + snapshot_query = snapshot_query.replace( + 'REGEXP_CONTAINS', 'REGEXP_LIKE') + snapshot_query = snapshot_query.replace('r\'(?i)', '\'') + snapshot_query = snapshot_query.replace('(\\b)\'', '(\\b)\', \'i\'') + query['streams_query'] = snapshot_query + else: + query['streams_query'] = snapshot_query + + with_streams.append(query) + +json.dump(with_streams, open('stream_query_october_2020.json', + 'wt', encoding='utf-8'), ensure_ascii=False) diff --git a/src/factiva/analytics/common/tools.py b/src/factiva/analytics/common/tools.py index 16404e0..6843c65 100644 --- a/src/factiva/analytics/common/tools.py +++ b/src/factiva/analytics/common/tools.py @@ -10,11 +10,14 @@ def print_property(property_value, default='') -> str: if isinstance(property_value, str): pval = property_value elif isinstance(property_value, int): - pval = f'{property_value:,d}' + pval = f"{property_value:,d}" elif isinstance(property_value, float): - pval = f'{property_value:,f}' + pval = f"{property_value:,f}" elif isinstance(property_value, list): - pval = f' - [{len(property_value)}] elements' + if(hasattr(property_value, 'items')): + pval = f" - [{len(property_value.items)}] elements" + else: + pval = f" - [{len(property_value)}] elements" elif isinstance(property_value, pd.DataFrame): pval = f" - [{property_value.shape[0]}] rows" else: @@ -48,7 +51,7 @@ def validate_field_options(field, available_options): """ if field not in available_options: raise ValueError( - f'Value {field} is not within the allowed options: {available_options}' + f"Value {field} is not within the allowed options: {available_options}" ) @@ -179,6 +182,6 @@ def parse_field(field, field_name): if isinstance(field, str): return eval(field) - raise ValueError(f'Unexpected value for {field_name}') + raise ValueError(f"Unexpected value for {field_name}") diff --git a/src/factiva/analytics/integration/__init__.py b/src/factiva/analytics/integration/__init__.py index e69de29..38f71d1 100644 --- a/src/factiva/analytics/integration/__init__.py +++ b/src/factiva/analytics/integration/__init__.py @@ -0,0 +1,7 @@ +""" +Factiva Analytics package Integration Tools +""" + +__all__ = ['SnapshotFiles'] + +from .files import SnapshotFiles diff --git a/src/factiva/analytics/integration/files.py b/src/factiva/analytics/integration/files.py index 7b747c7..4c0db33 100755 --- a/src/factiva/analytics/integration/files.py +++ b/src/factiva/analytics/integration/files.py @@ -1,24 +1,27 @@ import os import pandas as pd import fastavro -from .. 
import common +from ..common import const class SnapshotFiles(object): - def read_file(self, filepath, only_stats=False, merge_body=False) -> pd.DataFrame: + def read_avro_file(self, filepath, stats_only=False, merge_body=False, all_fields=False) -> pd.DataFrame: """Reads a single Dow Jones snapshot datafile Parameters ---------- filepath : str Relative or absolute file path - only_stats : bool, optional + stats_only : bool, optional Specifies if only file metadata is loaded (True), or if the full article content is loaded (False). On average, only_stats loads about 1/10 and is recommended for quick metadata-based analysis. (Default is False) merge_body : bool, optional Specifies if the body field should be merged with the snippet and this last column being dropped. (default is False) + all_fields : bool, optional + If set, all fields are loaded to the Pandas DataFrame. If set to `True`, parameters `stats_only` and + `merge_body` are ignored. Returns ------- pandas.DataFrame @@ -29,27 +32,28 @@ def read_file(self, filepath, only_stats=False, merge_body=False) -> pd.DataFram records = [r for r in reader] r_df = pd.DataFrame.from_records(records) - if only_stats is True: - r_df = r_df[common.SNAPSHOT_FILE_STATS_FIELDS] - - if (only_stats is False) & (merge_body is True): - r_df['body'] = r_df['snippet'] + '\n\n' + r_df['body'] - r_df.drop('snippet', axis=1, inplace=True) + if not all_fields: + if stats_only: + r_df = r_df[const.SNAPSHOT_FILE_STATS_FIELDS] + else: + if merge_body: + r_df['body'] = r_df['snippet'] + '\n\n' + r_df['body'] + r_df.drop('snippet', axis=1, inplace=True) + r_df['body'] = r_df['body'].astype(str) - if only_stats is False: - r_df['body'] = r_df[['body']].apply(lambda x: '{}'.format(x[0]), axis=1) + r_df.drop(columns=[d_field for d_field in const.SNAPSHOT_FILE_DELETE_FIELDS if d_field in r_df.columns], inplace=True) + else: + # TODO: Support merge_body for when all_fields is True + r_df['body'] = r_df['body'].astype(str) - for d_field in common.SNAPSHOT_FILE_DELETE_FIELDS: - if d_field in r_df.columns: - r_df.drop(d_field, axis=1, inplace=True) + for field in const.TIMESTAMP_FIELDS: + if field in r_df.columns: + r_df[field] = r_df[field].astype('datetime64[ms]') - r_df['publication_datetime'] = r_df['publication_datetime'].astype('datetime64[ms]') - r_df['modification_datetime'] = r_df['modification_datetime'].astype('datetime64[ms]') - r_df['ingestion_datetime'] = r_df['ingestion_datetime'].astype('datetime64[ms]') return r_df - def read_folder(self, folderpath, file_format='AVRO', only_stats=False, merge_body=False) -> pd.DataFrame: + def read_avro_folder(self, folderpath, file_format='AVRO', only_stats=False, merge_body=False) -> pd.DataFrame: """Scans a folder and reads the content of all files matching the format (file_format) Parameters ---------- @@ -72,6 +76,23 @@ def read_folder(self, folderpath, file_format='AVRO', only_stats=False, merge_bo r_df = pd.DataFrame() for filename in os.listdir(folderpath): if filename.lower().endswith("." 
+ format_suffix): - t_df = self.read_file(folderpath + "/" + filename, only_stats, merge_body) + t_df = self.read_avro_file(folderpath + "/" + filename, only_stats, merge_body) r_df = pd.concat([r_df, t_df]) return r_df + + def read_raw_avro(self, filepath) -> pd.DataFrame: + """Reads a generic AVRO file into a Pandas DataFrame + Parameters + ---------- + filepath : str + Relative or absolute file path + Returns + ------- + pandas.DataFrame + A single Pandas Dataframe with the file content + """ + with open(filepath, "rb") as fp: + reader = fastavro.reader(fp) + r_df = pd.DataFrame.from_records(reader) + + return r_df diff --git a/src/factiva/analytics/integration/listener_handlers.py b/src/factiva/analytics/integration/listener_handlers.py index 6c3a410..88b9164 100644 --- a/src/factiva/analytics/integration/listener_handlers.py +++ b/src/factiva/analytics/integration/listener_handlers.py @@ -3,7 +3,7 @@ import json import os -from factiva.core import const, factiva_logger, get_factiva_logger, tools +from factiva.analytics import const, factiva_logger, get_factiva_logger, tools from google.cloud import bigquery from pymongo import MongoClient @@ -29,7 +29,7 @@ def write_jsonl_line(self, file_prefix, action, file_suffix, message): message : str Message to be write on the file """ - output_filename = f'{file_prefix}_{action}_{file_suffix}.jsonl' + output_filename = f"{file_prefix}_{action}_{file_suffix}.jsonl" output_filepath = os.path.join(const.LISTENER_FILES_DEFAULT_FOLDER, output_filename) with open(output_filepath, mode='a', encoding='utf-8') as fp: @@ -82,7 +82,7 @@ def save(self, message, subscription_id) -> bool: json.dumps(message))) self.counter += 1 if self.counter % 100 == 0: - print(f'\n[{self.counter}]', end='') + print(f"\n[{self.counter}]", end='') else: print(const.ACTION_CONSOLE_INDICATOR[const.ERR_ACTION], end='') @@ -243,7 +243,7 @@ def save(self, message, subscription_id) -> bool: self.counter += 1 if self.counter % 100 == 0: - print(f'\n[{self.counter}]', end='') + print(f"\n[{self.counter}]", end='') else: print(const.ACTION_CONSOLE_INDICATOR[const.ERR_ACTION], end='') diff --git a/src/factiva/analytics/snapshots/__init__.py b/src/factiva/analytics/snapshots/__init__.py index 346bb6c..9e26d21 100755 --- a/src/factiva/analytics/snapshots/__init__.py +++ b/src/factiva/analytics/snapshots/__init__.py @@ -5,7 +5,7 @@ __all__ = [ 'SnapshotExplain', 'SnapshotExplainQuery', 'SnapshotExplainJobResponse', 'SnapshotExplainSamplesResponse', 'SnapshotTimeSeries', 'SnapshotTimeSeriesQuery', 'SnapshotTimeSeriesJobReponse', - 'SnapshotExtraction', 'SnapshotExtractionQuery', 'SnapshotExtractionJobReponse' + 'SnapshotExtraction', 'SnapshotExtractionQuery', 'SnapshotExtractionJobReponse', 'SnapshotExtractionListItem', 'SnapshotExtractionList', ] from .query import SnapshotQuery @@ -14,4 +14,4 @@ from .explain import SnapshotExplain, SnapshotExplainQuery, SnapshotExplainJobResponse, SnapshotExplainSamplesResponse from .time_series import SnapshotTimeSeries, SnapshotTimeSeriesQuery, SnapshotTimeSeriesJobReponse -from .extraction import SnapshotExtraction, SnapshotExtractionQuery, SnapshotExtractionJobReponse +from .extraction import SnapshotExtraction, SnapshotExtractionQuery, SnapshotExtractionJobReponse, SnapshotExtractionListItem, SnapshotExtractionList diff --git a/src/factiva/analytics/snapshots/base.py b/src/factiva/analytics/snapshots/base.py index abc8cd6..16ccfd6 100644 --- a/src/factiva/analytics/snapshots/base.py +++ b/src/factiva/analytics/snapshots/base.py @@ -1,14 +1,15 @@ 
+from typing import Optional
 from ..auth import UserKey
 from ..common import tools, config


 class SnapshotBaseJobResponse():

-    job_id: str = None
-    job_link: str = None
-    job_state: str = None
+    job_id: Optional[str] = None
+    job_link: Optional[str] = None
+    job_state: Optional[str] = None

-    def __init__(self, job_id:str=None) -> None:
+    def __init__(self, job_id: Optional[str] = None) -> None:
         self.job_id = job_id


@@ -17,20 +18,26 @@ def __repr__(self):

     def __str__(self, detailed=True, prefix=' |-', root_prefix=''):
-        ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n"
-        ret_val += f"{prefix}job_id: {tools.print_property(self.job_id)}\n"
-        ret_val += f"{prefix}job_link: {tools.print_property(self.job_link)}\n"
+        ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n"
+        if self.job_id and len(self.job_id) >= 40:
+            ret_val += f"{prefix}job_id: {tools.mask_string(self.job_id, 10)}\n"
+        else:
+            ret_val += f"{prefix}job_id: {tools.print_property(self.job_id)}\n"
+        if self.job_link:
+            ret_val += f"{prefix}job_link: {tools.print_property(self.job_link[0:20] + '...' + self.job_link[-10:])}\n"
+        else:
+            ret_val += f"{prefix}job_link: \n"
         ret_val += f"{prefix}job_state: {tools.print_property(self.job_state)}\n"
         return ret_val


 class SnapshotBaseQuery():

-    where: str = None
-    includes: list[str] = None
-    include_lists: list[dict] = None
-    excludes: list[str] = None
-    exclude_lists: list[dict] = None
+    where: Optional[str] = None
+    includes: Optional[dict] = None
+    include_lists: Optional[dict] = None
+    excludes: Optional[dict] = None
+    exclude_lists: Optional[dict] = None

     # TODO: Consider implementing a SQL validation functionality to ensure
     # fields are valid. There's sample projects doing something similar.
@@ -40,10 +47,10 @@
     def __init__(
         self,
         where=None,
-        includes:dict=None,
-        include_lists:dict=None,
-        excludes:dict=None,
-        exclude_lists:dict=None
+        includes: Optional[dict] = None,
+        include_lists: Optional[dict] = None,
+        excludes: Optional[dict] = None,
+        exclude_lists: Optional[dict] = None
     ):
         if isinstance(where, str):
             self.where = where
@@ -69,23 +76,25 @@ def __init__(

     def get_payload(self) -> dict:
-        query_dict = {
+        from typing import Any, Dict
+
+        query_dict: Dict[str, Any] = {
             "query": {
                 "where": self.where
             }
         }

         if self.includes:
-            query_dict["query"].update({"includes": self.includes})
+            query_dict["query"]["includes"] = self.includes

         if self.excludes:
-            query_dict["query"].update({'excludes': self.excludes})
+            query_dict["query"]["excludes"] = self.excludes

         if self.include_lists:
-            query_dict["query"].update({"includesList": self.includes})
+            query_dict["query"]["includesList"] = self.include_lists

         if self.exclude_lists:
-            query_dict["query"].update({'excludesList': self.excludes})
+            query_dict["query"]["excludesList"] = self.exclude_lists

         return query_dict

@@ -95,18 +104,21 @@ def __repr__(self):

     def __str__(self, detailed=True, prefix=' ├─', root_prefix=''):
-        ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n"
-        ret_val += f"{prefix}where: "
-        ret_val += (self.where[:77] + '...') if len(self.where) > 80 else self.where
+        ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n"
+        ret_val += f"{prefix}where: "
+        if self.where:
+            ret_val += (self.where[:77] + '...') if len(self.where) > 80 else self.where
+        else:
+            ret_val += ""
     # if detailed:
         ret_val += f"\n{prefix}includes: "
-        ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─{len(self.includes.keys())} conditions" if self.includes else ""
+        ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─{len(self.includes)} conditions" if self.includes else ""
         ret_val += f"\n{prefix}excludes: "
-        ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─{len(self.excludes.keys())} conditions" if self.excludes else ""
+        ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─{len(self.excludes)} conditions" if self.excludes else ""
         ret_val += f"\n{prefix}include_lists: "
-        ret_val
+= f"\n{prefix.replace('├', '│')[0:-1]} └─{len(self.include_lists.keys())} conditions" if self.include_lists else "" - ret_val += f"\n{prefix.replace('├', '└')}exclude_lists: " - ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─{len(self.exclude_lists.keys())} conditions" if self.exclude_lists else "" + ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─{len(self.include_lists)} conditions" if self.include_lists else "" + ret_val += f"\n{prefix}exclude_lists: " + ret_val += f"\n{prefix.replace('├', '│')[0:-1]} └─{len(self.exclude_lists)} conditions" if self.exclude_lists else "" # else: # ret_val += f"\n{prefix.replace('├', '└')}..." return ret_val @@ -121,9 +133,9 @@ class SnapshotBase(): __GETLIST_URL = None __log = None - user_key: UserKey = None - job_response: SnapshotBaseJobResponse = None - query: SnapshotBaseQuery = None + user_key: Optional[UserKey] = None + job_response: Optional[SnapshotBaseJobResponse] = None + query: Optional[SnapshotBaseQuery] = None def __init__( self, @@ -143,12 +155,12 @@ def __init__( # raise ValueError("Paramters query or job id are required") - def submit_job(self, payload=None): # TODO: NEXT! - pass + def submit_job(self, payload=None) -> bool: # TODO: NEXT! + return True def get_job_response_base(self) -> bool: - pass + return True def __repr__(self): @@ -156,8 +168,8 @@ def __repr__(self): def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): - ret_val = f"{root_prefix} bool: self.job_state = response_data['data']['attributes']['current_state'] self.link = response_data['links']['self'] elif response.status_code == 400: - raise ValueError(f'Invalid Query [{response.text}]') + raise ValueError(f"Invalid Query [{response.text}]") else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") return True @@ -355,9 +355,9 @@ def get_job_results(self) -> bool: raise RuntimeError('Job ID does not exist.') elif response.status_code == 400: detail = json.loads(response.text)['errors'][0]['detail'] - raise ValueError(f'Bad Request: {detail}') + raise ValueError(f"Bad Request: {detail}") else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") return True @@ -425,7 +425,7 @@ def download_file(self, endpoint_url: str, download_path: str): with open(download_path, 'wb') as download_file_path: download_file_path.write(response.content) else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") return True @@ -456,7 +456,7 @@ def download_job_files(self, download_path=None): if len(self.files) > 0: for file_uri in self.files: file_name = file_uri.split('/')[-1] - local_path = f'{download_path}/{file_name}' + local_path = f"{download_path}/{file_name}" self.download_file(file_uri, local_path) else: raise RuntimeError('No files available for download') @@ -485,16 +485,16 @@ def get_job_samples(self, num_samples): 'user-key': self.user_key.key } s_param = { 'num_samples': num_samples } - samples_url=f'{self.get_endpoint_url()}/{self.job_id}' + samples_url=f"{self.get_endpoint_url()}/{self.job_id}" response = req.api_send_request(method='GET', endpoint_url=samples_url, headers=headers_dict, 
qs_params=s_param) if response.status_code == 200: resp_json = response.json()['data']['attributes']['sample'] samples = pd.DataFrame(resp_json) - # print(f'DataFrame size: {samples.shape}') - # print(f'Columns: {samples.columns}') + # print(f"DataFrame size: {samples.shape}") + # print(f"Columns: {samples.columns}") return samples else: - print(f'Unexpected Response: {response.text}') + print(f"Unexpected Response: {response.text}") def __repr__(self): @@ -509,9 +509,9 @@ def __str__(self, detailed=True, prefix=' |-', root_prefix=''): ret_val = str(self.__class__) + '\n' if self.job_id == '': - ret_val += f'{prefix}' + ret_val += f"{prefix}" else: - ret_val += f'{prefix}user_key: ' + ret_val += f"{prefix}user_key: " ret_val += self.user_key.__str__(detailed=False, prefix=child_prefix) + '\n' del pprop['user_key'] diff --git a/src/factiva/analytics/snapshots/explain.py b/src/factiva/analytics/snapshots/explain.py index e7576fb..147f193 100644 --- a/src/factiva/analytics/snapshots/explain.py +++ b/src/factiva/analytics/snapshots/explain.py @@ -5,6 +5,7 @@ from ..common import log, const, req, tools import time import pandas as pd +from typing import Optional class SnapshotExplainSamplesResponse(): @@ -21,8 +22,8 @@ class SnapshotExplainSamplesResponse(): Pandas DataFrame with the samples dataset """ - num_samples : int = None - data : pd.DataFrame = None + num_samples : Optional[int] = None + data : Optional[pd.DataFrame] = None def __init__(self, samples_list:list) -> None: @@ -37,14 +38,14 @@ def __repr__(self): return super().__repr__() - def __str__(self, detailed=True, prefix=' ├─', root_prefix='') -> None: + def __str__(self, prefix=' ├─', root_prefix='') -> str: ret_val = f"{root_prefix} dict: @@ -188,17 +195,17 @@ class SnapshotExplain(SnapshotBase): # TODO: Refactor when repeating code across """ - __SAMPLES_BASEURL = f'{const.API_HOST}{const.API_EXTRACTIONS_BASEPATH}{const.API_EXTRACTIONS_SAMPLES_SUFFIX}' - __MAX_SAMPLES = 100 - samples : SnapshotExplainSamplesResponse = None - job_response : SnapshotExplainJobResponse = None - query : SnapshotExplainQuery = None + __SAMPLES_BASEURL = f"{const.API_HOST}{const.API_EXTRACTIONS_BASEPATH}{const.API_EXTRACTIONS_SAMPLES_SUFFIX}" + samples : Optional[SnapshotExplainSamplesResponse] = None + samples : Optional[SnapshotExplainSamplesResponse] = None + job_response : Optional[SnapshotBaseJobResponse] = None + query: Optional[SnapshotBaseQuery] = None def __init__( self, + job_id=None, user_key=None, - query=None, - job_id=None + query=None ): """ SnapshotExplain constructor. @@ -220,12 +227,12 @@ def __init__( Explain Job ID with a format like ``abcd1234-ab12-ab12-ab12-abcdef123456``. Not compatible if the parameter ``query``. 
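+
+        Examples
+        --------
+        A minimal construction sketch (the job ID below is only a format
+        placeholder):
+
+        .. code-block:: python
+
+            from factiva.analytics import SnapshotExplain
+            se = SnapshotExplain(job_id='abcd1234-ab12-ab12-ab12-abcdef123456')
+            print(se.job_response)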
""" - super().__init__(user_key=user_key, query=query, job_id=job_id) + super().__init__(job_id=job_id, query=query, user_key=user_key) self.__log = log.get_factiva_logger() - self.__JOB_BASE_URL = f'{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}' + self.__JOB_BASE_URL = f"{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}" if job_id: - self.__log.info(f'Creating SnapshotExplain instance with JobID {job_id}') + self.__log.info(f"Creating SnapshotExplain instance with JobID {job_id}") self.job_response = SnapshotExplainJobResponse(job_id) self.get_job_response() elif query: @@ -263,7 +270,7 @@ def submit_job(self): 'Content-Type': 'application/json' } - submit_url = f'{self.__JOB_BASE_URL}{const.API_EXPLAIN_SUFFIX}' + submit_url = f"{self.__JOB_BASE_URL}{const.API_EXPLAIN_SUFFIX}" submit_payload = self.query.get_payload() response = req.api_send_request(method='POST', endpoint_url=submit_url, headers=headers_dict, payload=submit_payload) @@ -274,9 +281,9 @@ def submit_job(self): self.job_response.job_state = response_data['data']['attributes']['current_state'] self.job_response.job_link = response_data['links']['self'] elif response.status_code == 400: - raise ValueError(f'Invalid Query [{response.text}]') + raise ValueError(f"Invalid Query [{response.text}]") else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") self.__log.info('submit_job End') return True @@ -304,12 +311,12 @@ def get_job_response(self) -> bool: 'Content-Type': 'application/json' } - self.__log.info(f'Requesting Explain Job info for ID {self.job_response.job_id}') - getinfo_url = f'{self.__JOB_BASE_URL}/{self.job_response.job_id}{const.API_EXPLAIN_SUFFIX}' + self.__log.info(f"Requesting Explain Job info for ID {self.job_response.job_id}") + getinfo_url = f"{self.__JOB_BASE_URL}/{self.job_response.job_id}{const.API_EXPLAIN_SUFFIX}" response = req.api_send_request(method='GET', endpoint_url=getinfo_url, headers=headers_dict) if response.status_code == 200: - self.__log.info(f'Job ID {self.job_response.job_id} info retrieved successfully') + self.__log.info(f"Job ID {self.job_response.job_id} info retrieved successfully") response_data = response.json() self.job_response.job_state = response_data['data']['attributes']['current_state'] self.job_response.job_link = response_data['links']['self'] @@ -324,14 +331,14 @@ def get_job_response(self) -> bool: raise RuntimeError('Job ID does not exist.') elif response.status_code == 400: detail = response_data['errors'][0]['detail'] - raise ValueError(f'Bad Request: {detail}') + raise ValueError(f"Bad Request: {detail}") else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") self.__log.info('get_job_response End') return True - def get_samples(self, num_samples:int=__MAX_SAMPLES): + def get_samples(self, num_samples: int = const.API_MAX_SAMPLES): """ Performs a request to the API using the job ID to get its status. 
@@ -347,9 +354,9 @@ def get_samples(self, num_samples:int=__MAX_SAMPLES):
        # super().get_job_response_base()
        if (not self.job_response):
            raise RuntimeError('Job has not yet been submitted or Job ID was not set')
-
-        if (num_samples < 1) or (num_samples > self.__MAX_SAMPLES):
-            raise ValueError(f'The n_samples value must be an integer between 1 and {self.__MAX_SAMPLES}')
+
+        if (num_samples < 1) or (num_samples > const.API_MAX_SAMPLES):
+            raise ValueError(f"The num_samples value must be an integer between 1 and {const.API_MAX_SAMPLES}")
        headers_dict = {
            'user-key': self.user_key.key,
@@ -360,24 +367,24 @@
            'num_samples': num_samples
        }
-        self.__log.info(f'Requesting Samples for JobID {self.job_response.job_id}')
-        samples_url = f'{self.__SAMPLES_BASEURL}/{self.job_response.job_id}'
+        self.__log.info(f"Requesting {num_samples} samples for JobID {self.job_response.job_id}")
+        samples_url = f"{self.__SAMPLES_BASEURL}/{self.job_response.job_id}"
        response = req.api_send_request(method='GET', endpoint_url=samples_url, headers=headers_dict, qs_params=qs_parameters)
        if response.status_code == 200:
-            self.__log.info(f'Samples for Job ID {self.job_response.job_id} retrieved successfully')
+            self.__log.info(f"Samples for Job ID {self.job_response.job_id} retrieved successfully")
            response_data = response.json()
            self.samples = SnapshotExplainSamplesResponse(response_data['data']['attributes']['sample'])
        elif response.status_code == 404:
            raise RuntimeError('Job ID does not exist.')
        elif response.status_code == 400:
            detail = response_data['errors'][0]['detail']
-            raise ValueError(f'Bad Request: {detail}')
+            raise ValueError(f"Bad Request: {detail}")
        else:
-            raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]')
+            raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]")
        self.__log.info('get_samples End')
        return True
@@ -420,7 +427,7 @@ def __repr__(self):
    def __str__(self, detailed=True, prefix=' ├─', root_prefix=''):
        ret_val = super().__str__(detailed, prefix, root_prefix)
        if self.samples:
-            ret_val += f"\n{prefix[0:-2]}└─samples: {self.samples.__str__(detailed=False, prefix=' ├─')}"
+            ret_val += f"\n{prefix[0:-2]}└─samples: {self.samples.__str__(prefix=' ├─')}"
        else:
            ret_val += f"\n{prefix[0:-2]}└─samples: "
        return ret_val
diff --git a/src/factiva/analytics/snapshots/extraction.py b/src/factiva/analytics/snapshots/extraction.py
index 84870fa..4150786 100644
--- a/src/factiva/analytics/snapshots/extraction.py
+++ b/src/factiva/analytics/snapshots/extraction.py
@@ -7,6 +7,7 @@
 from ..common import log, const, req, tools
 from ..auth import UserKey
 from pathlib import Path
+import pandas as pd

 class SnapshotExtractionJobReponse(SnapshotBaseJobResponse):
@@ -42,7 +43,7 @@ def __init__(self, job_id: str = None, user_key: UserKey = None) -> None:
            self.job_id = job_id
            self.short_id = job_id.split('-')[-1]
        elif (len(job_id) == 10) and (user_key):
-            self.job_id = f'dj-synhub-extraction-{user_key.key.lower()}-{job_id}'
+            self.job_id = f"dj-synhub-extraction-{user_key.key.lower()}-{job_id}"
            self.short_id = job_id
        else:
            raise ValueError('Unexpected value for job_id.
If a short_id is provided, a user_key instance is needed.')
@@ -59,7 +60,7 @@ def __str__(self, detailed=True, prefix=' ├─', root_prefix=''):
        ret_val += f"\n{prefix}files: {tools.print_property(self.files)}"
        if self.errors:
            ret_val += f"\n{prefix.replace('│ ├', ' └')}errors: [{len(self.errors)}]"
-            err_list = [f"\n{prefix[0:-1]} |-{err['title']}: {err['detail']}" for err in self.errors]
+            err_list = [f"\n{prefix.replace('│ ├', ' └')}[{err['title']}]: {err['detail']}" for err in self.errors]
            for err in err_list:
                ret_val += err
        else:
@@ -95,6 +96,7 @@ class SnapshotExtractionQuery(SnapshotBaseQuery):
    file_format: str
    limit: int
+    shards: int

    def __init__(self,
                 where:str = None,
@@ -103,7 +105,8 @@
                 excludes: dict = None,
                 exclude_lists: dict = None,
                 file_format: str = const.API_AVRO_FORMAT,
-                 limit: int = 0) -> None:
+                 limit: int = 0,
+                 shards: int = 25) -> None:
        """
        Creates a new SnapshotExtractionQuery instance.
@@ -142,12 +145,30 @@
        else:
            raise ValueError("Limit value is not valid or not positive")
-        tools.validate_type(file_format, str, "Unexpected value for file_format")
-        file_format = file_format.lower().strip()
-        tools.validate_field_options(file_format, const.API_EXTRACTION_FILE_FORMATS)
+        tools.validate_type(shards, int, "Unexpected value for shards")
+        if 25 <= shards <= 10000:
+            self.shards = shards
+        else:
+            raise ValueError("Shards value must be an integer between 25 and 10000")
+
+        # tools.validate_type(file_format, str, "Unexpected value for file_format")
+        # file_format = file_format.lower().strip()
+        # tools.validate_field_options(file_format, const.API_EXTRACTION_FILE_FORMATS)
        self.file_format = file_format

+    @property
+    def file_format(self):
+        return self._file_format
+
+    @file_format.setter
+    def file_format(self, value):
+        tools.validate_type(value, str, "Unexpected value for file_format")
+        value = value.lower().strip()
+        tools.validate_field_options(value, const.API_EXTRACTION_FILE_FORMATS)
+        self._file_format = value
+
    def get_payload(self) -> dict:
        """
        Create the basic request payload to be used within a Snapshots Extraction API
@@ -165,6 +186,7 @@
            query_dict["query"].update({"limit": self.limit})
        query_dict["query"].update({"format": self.file_format})
+        query_dict["query"].update({"shards": self.shards})
        return query_dict
@@ -177,6 +199,7 @@
        ret_val = super().__str__(detailed, prefix, root_prefix)
        ret_val = ret_val.replace('└─', '├─')
        ret_val += f"\n{prefix}file_format: {tools.print_property(self.file_format)}"
+        ret_val += f"\n{prefix}shards: {tools.print_property(self.shards)}"
        ret_val += f"\n{prefix[0:-2]}└─limit: {tools.print_property(self.limit)}"
        return ret_val
@@ -222,12 +245,12 @@ def __init__(self, job_id=None, query=None, user_key=None) -> None:
        super().__init__(user_key, query, job_id)
        self.__log = log.get_factiva_logger()
-        self.__JOB_BASE_URL = f'{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}'
+        self.__JOB_BASE_URL = f"{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}"
        self.__log.info('creating SnapshotExtraction...')
        if job_id:
-            self.__log.info(f'Creating SnapshotExtraction instance with JobID {job_id}')
+            self.__log.info(f"Creating SnapshotExtraction instance with JobID {job_id}")
            self.job_response = SnapshotExtractionJobReponse(job_id, self.user_key)
            self.get_job_response()
        elif query:
@@ -271,7 +294,7 @@ def submit_job(self):
            'Content-Type': 'application/json'
        }
-        submit_url = f'{self.__JOB_BASE_URL}'
+        submit_url = 
f"{self.__JOB_BASE_URL}" submit_payload = self.query.get_payload() response = req.api_send_request(method='POST', endpoint_url=submit_url, headers=headers_dict, payload=submit_payload) @@ -282,9 +305,9 @@ def submit_job(self): self.job_response.job_state = response_data['data']['attributes']['current_state'] self.job_response.job_link = response_data['links']['self'] elif response.status_code == 400: - raise ValueError(f'Invalid Query [{response.text}]') + raise ValueError(f"Invalid Query [{response.text}]") else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") self.__log.info('submit_job OK') return True @@ -302,7 +325,8 @@ def get_job_response(self) -> bool: Returns ------- bool - True if the get request was successful. An Exception otherwise. + True if the get request was successful. False for FAILED jobs + and an Exception for unexpected HTTP codes. Raises ------ @@ -311,7 +335,7 @@ def get_job_response(self) -> bool: is invalid. """ - self.__log.info(f'get_job_response for ID {self.job_response.short_id}') + self.__log.info(f"get_job_response for ID {self.job_response.short_id}") if (not self.job_response): raise RuntimeError('Job has not yet been submitted or Job ID was not set') @@ -321,26 +345,31 @@ def get_job_response(self) -> bool: 'Content-Type': 'application/json' } - getinfo_url = f'{self.__JOB_BASE_URL}/{self.job_response.job_id}' + getinfo_url = f"{self.__JOB_BASE_URL}/{self.job_response.job_id}" response = req.api_send_request(method='GET', endpoint_url=getinfo_url, headers=headers_dict) if response.status_code == 200: - self.__log.info(f'Job ID {self.job_response.job_id} info retrieved successfully') + self.__log.info(f"Job ID {self.job_response.job_id} info retrieved successfully") response_data = response.json() self.job_response.job_state = response_data['data']['attributes']['current_state'] + self.__log.info(f"Received State: {self.job_response.job_state}") self.job_response.job_link = response_data['links']['self'] if self.job_response.job_state == const.API_JOB_DONE_STATE: files_obj_list = response_data['data']['attributes']['files'] self.job_response.files = [obj['uri'] for obj in files_obj_list] if 'errors' in response_data.keys(): + self.job_response.files = [] self.job_response.errors = response_data['errors'] + for err in self.job_response.errors: + self.__log.error(f"JobError: [{err['title']}] {err['detail']}") + return False elif response.status_code == 404: raise ValueError('Job ID does not exist for the provided user key.') elif response.status_code == 400: detail = response_data['errors'][0]['detail'] - raise ValueError(f'Bad Request: {detail}') + raise ValueError(f"Bad Request: {detail}") else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") self.__log.info('get_job_response OK') return True @@ -374,7 +403,7 @@ def __download_extraction_file(self, file_uri: str, download_path: str) -> bool: with open(download_path, 'wb') as download_file_path: download_file_path.write(response.content) else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") return True @@ -401,7 +430,7 @@ def 
download_files(self, path=None):
        are available for download or the download failed.
        """
-
+        self.__log.info('download_files start')
        if self.job_response:
            if path is None:
                path = os.path.join(os.getcwd(), self.job_response.short_id)
@@ -410,14 +439,16 @@
            if len(self.job_response.files) > 0:
                for file_uri in self.job_response.files:
                    file_name = file_uri.split('/')[-1]
-                    local_path = f'{path}/{file_name}'
+                    local_path = f"{path}/{file_name}"
+                    # TODO: Add a try/except block and retry files that failed to download
                    self.__download_extraction_file(file_uri, local_path)
            else:
                return False
            return True
        else:
            print("Job has not yet been submitted")
-            return False
+            self.__log.info('download_files end')
+            return False

    @log.factiva_logger
@@ -429,13 +460,14 @@ def process_job(self, path=None):  # TODO: Implement Retries if a 500 or timeout
        Returns
        -------
        bool
-            True if the extraction processing was successful. An Exception
-            otherwise.
+            True if the extraction processing was successful. False if the job
+            execution failed. Raises an exception otherwise.
        """
+        ret_val = True
        self.__log.info('process_job Start')
        self.submit_job()
-        self.get_job_response()
+        ret_val = self.get_job_response()
        while not (self.job_response.job_state in
                    [const.API_JOB_DONE_STATE,
@@ -444,11 +476,15 @@
            if self.job_response.job_state not in const.API_JOB_EXPECTED_STATES:
                raise RuntimeError('Unexpected job state')
            time.sleep(const.API_JOB_ACTIVE_WAIT_SPACING)
-            self.get_job_response()
+            if not self.get_job_response():
+                ret_val = False

-        self.download_files(path=path)
+        if len(self.job_response.files) > 0:
+            self.download_files(path=path)
+        else:
+            self.__log.info('No files to download. 
Check for error messages.') self.__log.info('process_job End') - return True + return ret_val def __repr__(self): @@ -459,3 +495,71 @@ def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): ret_val = super().__str__(detailed, prefix, root_prefix) ret_val = ret_val.replace('├─job_response', '└─job_response') return ret_val + + +class SnapshotExtractionListItem(): + + id: str = None + short_id: str = None + job_status: str = None + format: str = None + + def __init__(self, id:str=None, + short_id:str=None, + current_state:str=None, + format:str=None) -> None: + self.id = id + self.short_id = short_id + self.job_status = current_state + self.format = format + + + def __repr__(self): + return self.__str__() + + + def __str__(self, prefix=' ├─', root_prefix='', row=True, index=None): + if row: + if index is not None: + prefix = f"{prefix}[{index:<3}] " + ret_val = f"{prefix}{self.short_id:<12} {self.job_status:<16} {self.format:<8}\n" + else: + ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n" + ret_val += f"{prefix}short_id: {self.short_id}\n" + ret_val += f"{prefix}current_state: {self.job_status}\n" + ret_val += f"{prefix}format: {self.format}" + return ret_val + + +class SnapshotExtractionList(list): + + items: list[SnapshotExtractionListItem] = None + + + def __init__(self, df_extractions: pd.DataFrame = None) -> None: + self.items = [] + if df_extractions is not None: + for index, row in df_extractions.iterrows(): + self.items.append(SnapshotExtractionListItem( + short_id=row['short_id'], + current_state=row['current_state'], + format=row['format'] + )) + + + def __getitem__(self, index): + return SnapshotExtraction(self.items[index].short_id) + + + def __repr__(self): + return self.__str__() + + + def __str__(self, prefix=' ├─'): + ret_val = f"<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n" + ret_val += f"{prefix} {'short_id':<12} {'job_status':<16} {'format':<8}\n" + for ix, item in enumerate(self.items): + ret_val += item.__str__(row=True, index=ix) + return ret_val + + diff --git a/src/factiva/analytics/snapshots/jobs.py b/src/factiva/analytics/snapshots/jobs.py index 78bb71c..048713f 100644 --- a/src/factiva/analytics/snapshots/jobs.py +++ b/src/factiva/analytics/snapshots/jobs.py @@ -24,9 +24,9 @@ def get_endpoint_url(self): """Get endpoint URL.""" endpoint = '' if (self.extraction_type == const.API_SAMPLES_EXTRACTION_TYPE): - endpoint = f'{const.API_HOST}{const.API_EXTRACTIONS_BASEPATH}{const.API_EXTRACTIONS_SAMPLES_SUFFIX}' + endpoint = f"{const.API_HOST}{const.API_EXTRACTIONS_BASEPATH}{const.API_EXTRACTIONS_SAMPLES_SUFFIX}" else: - endpoint = f'{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}{const.API_EXPLAIN_SUFFIX}' + endpoint = f"{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}{const.API_EXPLAIN_SUFFIX}" # Set default for safety self.extraction_type = const.API_DEFAULT_EXTRACTION_TYPE @@ -54,7 +54,7 @@ def __init__(self, user_key): # pylint: disable=no-self-use def get_endpoint_url(self): """Get endpoint URL.""" - return f'{const.API_HOST}{const.API_ANALYTICS_BASEPATH}' + return f"{const.API_HOST}{const.API_ANALYTICS_BASEPATH}" # pylint: disable=no-self-use def get_job_id(self, source): @@ -67,7 +67,7 @@ def set_job_data(self, source): for field in const.API_GROUP_DIMENSIONS_FIELDS: if field not in self.data.columns: - self.data[field] = f'ALL_{field.upper().strip()}' + self.data[field] = f"ALL_{field.upper().strip()}" class ExtractionJob(BulkNewsJob): @@ -83,12 +83,12 @@ def __init__(self, snapshot_id=None, user_key=None): 
if snapshot_id and user_key: self.job_id = snapshot_id - self.link = f'{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}/dj-synhub-extraction-{self.user_key.key.lower()}-{snapshot_id}' + self.link = f"{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}/dj-synhub-extraction-{self.user_key.key.lower()}-{snapshot_id}" # pylint: disable=no-self-use def get_endpoint_url(self): """Obtain endpoint URL.""" - return f'{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}' + return f"{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}" # pylint: disable=no-self-use def get_job_id(self, source): @@ -154,7 +154,7 @@ def __init__(self, update_type=None, snapshot_id=None, update_id=None, user_key= self.job_id = update_id self.update_type = update_id.split('-')[1] self.snapshot_id = update_id.split('-')[0] - self.link = f'{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}/dj-synhub-extraction-{self.user_key.key.lower()}-{update_id}' + self.link = f"{const.API_HOST}{const.API_SNAPSHOTS_BASEPATH}/dj-synhub-extraction-{self.user_key.key.lower()}-{update_id}" self.get_job_results() elif update_type and snapshot_id: @@ -165,7 +165,7 @@ def __init__(self, update_type=None, snapshot_id=None, update_id=None, user_key= def get_endpoint_url(self): """Get endpoint URL.""" - return f'{const.API_HOST}{const.API_EXTRACTIONS_BASEPATH}/dj-synhub-extraction-{self.user_key.key.lower()}-{self.snapshot_id}/{self.update_type}' + return f"{const.API_HOST}{const.API_EXTRACTIONS_BASEPATH}/dj-synhub-extraction-{self.user_key.key.lower()}-{self.snapshot_id}/{self.update_type}" def get_job_id(self, source): """Get job ID from source.""" diff --git a/src/factiva/analytics/snapshots/query.py b/src/factiva/analytics/snapshots/query.py index 7e07bbe..55ac021 100644 --- a/src/factiva/analytics/snapshots/query.py +++ b/src/factiva/analytics/snapshots/query.py @@ -48,9 +48,6 @@ def __init__(self, else: raise ValueError("Limit value is not valid or not positive") - validate_type(file_format, str, "Unexpected value for file_format") - file_format = file_format.lower().strip() - validate_field_options(file_format, const.API_EXTRACTION_FILE_FORMATS) self.file_format = file_format validate_type(frequency, str, "Unexpected value for frequency") @@ -77,11 +74,25 @@ def __init__(self, else: raise ValueError('Top value is not valid') + + @property + def file_format(self): + return self._file_format + + @file_format.setter + def file_format(self, value): + validate_type(value, str, "Unexpected value for file_format") + value = value.lower().strip() + validate_field_options(value, const.API_EXTRACTION_FILE_FORMATS) + self._file_format = value + + def get_explain_query(self): """Obtain Base Query.""" query_dict = self.get_base_query() return query_dict + def get_analytics_query(self): """Obtain analytics Query.""" query_dict = self.get_base_query() @@ -115,6 +126,7 @@ def get_analytics_query(self): query_dict["query"].update({"top": self.top}) return query_dict + def get_extraction_query(self): """Obtain the string querying Factiva.""" query_dict = self.get_base_query() @@ -126,10 +138,12 @@ def get_extraction_query(self): return query_dict + def __repr__(self): """Create string representation for Query Class.""" return self.__str__() + def __str__(self, detailed=False, prefix=' |-', root_prefix=''): """Create string representation for Query Class.""" pprop = self.__dict__.copy() @@ -138,9 +152,9 @@ def __str__(self, detailed=False, prefix=' |-', root_prefix=''): if detailed: ret_val += '\n'.join(('{}{} = {}'.format(prefix, item, pprop[item]) for item in pprop)) else: - 
ret_val += f'{prefix}where: ' + ret_val += f"{prefix}where: " ret_val += (self.where[:77] + '...') if len(self.where) > 80 else self.where - ret_val += f'\n{prefix}...' + ret_val += f"\n{prefix}..." del pprop['where'] # ret_val += '\n'.join(('{}{} = {}'.format(prefix, item, pprop[item]) for item in pprop)) return ret_val diff --git a/src/factiva/analytics/snapshots/snapshot.py b/src/factiva/analytics/snapshots/snapshot.py index 4bdd604..9e2acdc 100644 --- a/src/factiva/analytics/snapshots/snapshot.py +++ b/src/factiva/analytics/snapshots/snapshot.py @@ -482,7 +482,7 @@ def process_update(self, update_type, download_path=None): -------- Process update job with type 'additions' >>> previous_snapshot = Snapshot(user_key=my_user, snapshot_id='sdjjekl93j') - >>> previous_snapshot.process_update('additions', download_path=f'./{previous_snapshot.snapshot_id}/additions/') + >>> previous_snapshot.process_update('additions', download_path=f"./{previous_snapshot.snapshot_id}/additions/") """ self.last_update_job = UpdateJob(update_type=update_type, snapshot_id=self.last_extraction_job.job_id) @@ -501,27 +501,27 @@ def __str__(self, detailed=True, prefix=' |-', root_prefix=''): child_prefix = ' | ' + prefix ret_val = str(self.__class__) + '\n' - ret_val += f'{prefix}user_key: ' + ret_val += f"{prefix}user_key: " ret_val += self.user_key.__str__(detailed=False, prefix=child_prefix) del pprop['user_key'] ret_val += '\n' - ret_val += f'{prefix}query: ' + ret_val += f"{prefix}query: " ret_val += self.query.__str__(detailed=False, prefix=child_prefix) del pprop['query'] ret_val += '\n' - ret_val += f'{prefix}last_explain_job: ' + ret_val += f"{prefix}last_explain_job: " ret_val += self.last_explain_job.__str__(detailed=False, prefix=child_prefix) del pprop['last_explain_job'] ret_val += '\n' - ret_val += f'{prefix}last_analytics_job: ' + ret_val += f"{prefix}last_analytics_job: " ret_val += self.last_analytics_job.__str__(detailed=False, prefix=child_prefix) del pprop['last_analytics_job'] ret_val += '\n' - ret_val += f'{prefix}last_extraction_job: ' + ret_val += f"{prefix}last_extraction_job: " ret_val += self.last_extraction_job.__str__(detailed=False, prefix=child_prefix) del pprop['last_extraction_job'] ret_val += '\n' diff --git a/src/factiva/analytics/snapshots/time_series.py b/src/factiva/analytics/snapshots/time_series.py index f357080..4e98e53 100644 --- a/src/factiva/analytics/snapshots/time_series.py +++ b/src/factiva/analytics/snapshots/time_series.py @@ -1,8 +1,10 @@ """ Classes to interact with the Snapshot Analytics (TimeSeries) endpoint """ +from io import StringIO import time import pandas as pd +from typing import Any, Optional from .base import SnapshotBase, SnapshotBaseQuery, SnapshotBaseJobResponse from ..common import log, const, tools, req @@ -27,14 +29,27 @@ class SnapshotTimeSeriesJobReponse(SnapshotBaseJobResponse): """ - data : pd.DataFrame = None - errors : list[dict] = None + _data : Optional[pd.DataFrame] = None + _download_link : Optional[str] = None + _errors : Optional[list[dict]] = None + # Override inherited properties with private variables + _job_id: Optional[str] = None + _job_link: Optional[str] = None + _job_state: Optional[str] = None # Consider adding calculated values for start/end date and the number # of records - def __init__(self, job_id: str = None) -> None: - super().__init__(job_id) + def __init__(self, job_id: Optional[str] = None) -> None: + # Initialize private variables directly to avoid property conflicts + # Initialize using private variables first + 
self._job_id = None + self._job_link = None + self._job_state = None + + # Then use property setter for validation if job_id is provided + if job_id is not None: + self.job_id = job_id # type: ignore[misc] def __repr__(self): @@ -43,7 +58,11 @@ def __repr__(self): def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): ret_val = super().__str__(detailed, prefix, root_prefix) - ret_val += f"{prefix}data: {tools.print_property(self.data)}" + if self.download_link: + ret_val += f"{prefix}download_link: {tools.print_property(self.download_link[0:20] + '...' + self.download_link[-20:])}" + else: + ret_val += f"{prefix}download_link: " + ret_val += f"\n{prefix}data: {tools.print_property(self.data)}" if self.errors: ret_val += f"\n{prefix.replace('├', '└')}errors: [{len(self.errors)}]" err_list = [f"\n{prefix[0:-1]} |-{err['title']}: {err['detail']}" for err in self.errors] @@ -54,12 +73,101 @@ def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): return ret_val + # Getter and Setter methods + @property + def data(self) -> Optional[pd.DataFrame]: + """Get the data DataFrame.""" + return self._data + + @data.setter + def data(self, value: Optional[pd.DataFrame]) -> None: + """Set the data DataFrame.""" + self._data = value + + @property + def download_link(self) -> Optional[str]: + """Get the download link.""" + return self._download_link + + @download_link.setter + def download_link(self, value: Optional[str]) -> None: + """Set the download link.""" + self._download_link = value + + @property + def errors(self) -> Optional[list[dict]]: + """Get the errors list.""" + return self._errors + + @errors.setter + def errors(self, value: Optional[list[dict]]) -> None: + """Set the errors list.""" + self._errors = value + + # Override inherited properties from base class + @property + def job_id(self) -> Optional[str]: + """Get the job ID.""" + return self._job_id + + @job_id.setter + def job_id(self, value: Optional[str]) -> None: + """Set the job ID with TimeSeries-specific validation. 
+
+        TimeSeries job IDs should follow the format: abcd1234-ab12-ab12-ab12-abcdef123456
+        """
+        if value is None:
+            raise ValueError("Job ID cannot be None")
+
+        tools.validate_type(value, str, "Job ID must be a string")
+
+        # Validate UUID format for TimeSeries jobs (36 characters with hyphens)
+        if len(value) != 36 or value.count('-') != 4:
+            raise ValueError("TimeSeries job ID must be in UUID format (e.g., abcd1234-ab12-ab12-ab12-abcdef123456)")
+
+        # Additional validation: check if it matches UUID pattern
+        import re
+        uuid_pattern = r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$'
+        if not re.match(uuid_pattern, value.lower()):
+            raise ValueError("TimeSeries job ID must be a valid UUID format")
+
+        self._job_id = value
+
+    @property
+    def job_link(self) -> Optional[str]:
+        """Get the job link."""
+        return self._job_link
+
+    @job_link.setter
+    def job_link(self, value: Optional[str]) -> None:  # type: ignore[misc]
+        """Set the job link with validation."""
+        if value is not None:
+            tools.validate_type(value, str, "Job link must be a string")
+        self._job_link = value
+
+    @property
+    def job_state(self) -> Optional[str]:
+        """Get the job state."""
+        return self._job_state
+
+    @job_state.setter
+    def job_state(self, value: Optional[str]) -> None:  # type: ignore[misc]
+        """Set the job state with validation."""
+        if value is not None:
+            tools.validate_type(value, str, "Job state must be a string")
+            # Validate against expected job states from constants
+            expected_states = [const.API_JOB_DONE_STATE, const.API_JOB_FAILED_STATE] + const.API_JOB_EXPECTED_STATES
+            if hasattr(const, 'API_JOB_EXPECTED_STATES') and value not in expected_states:
+                # Allow setting even if not in expected states for flexibility
+                pass
+        self._job_state = value
+
+
 class SnapshotTimeSeriesQuery(SnapshotBaseQuery):
     """
     Snapshot Query for TimeSeries operations class. Used only in the context of
-    SnapshotTimeSeries, but can be transformed to other SnapshotQuery types when
-    those are created using an instance of this class as parameter.
+    SnapshotTimeSeries.

     Attributes
     ----------
@@ -77,26 +185,26 @@ class SnapshotTimeSeriesQuery(SnapshotBaseQuery):
     frequency : str
         Time unit used to aggregate values in the time-series calculation
     date_field : str
         Schema date-time field used to calculate the time-series dataset
-    group_dimensions : list[str]
-        List of fields to break-down aggregates per time period unit
+    group_dimension : str
+        Field name used to break down aggregates per time period unit
     top : str
         Max entries per group_dimension per time period unit
     """

-    frequency : str = None
-    date_field : str = None
-    group_dimensions : list[str] = None
-    top : int = None
+    _frequency : str = const.API_MONTH_PERIOD
+    _date_field : str = const.API_PUBLICATION_DATETIME_FIELD
+    _group_dimension : list[Any] | str
+    _top : Optional[int] = None

    def __init__(self,
                 where=None,
-                 includes: dict = None,
-                 include_lists: dict = None,
-                 excludes: dict = None,
-                 exclude_lists: dict = None,
+                 includes: Optional[dict] = None,
+                 include_lists: Optional[dict] = None,
+                 excludes: Optional[dict] = None,
+                 exclude_lists: Optional[dict] = None,
+                 group_dimension: Optional[list[Any] | str] = None,
                 frequency: str = const.API_MONTH_PERIOD,
                 date_field:str = const.API_PUBLICATION_DATETIME_FIELD,
-                 group_dimensions: list = [],
                 top: int = 10):
        """
        Class constructor
@@ -121,16 +229,16 @@
            Collection of bulk values to be removed from the selection criteria.
            Python dictionary with the format ``{column_name1: ['ListID1', 'listID2, ...],
            column_name2: ['listID1', 'listID2', ...]}``.
-        frequency : str, optional
+        frequency : str, optional (default: 'MONTH')
            Date part to be used to group subtotals in the time-series dataset. Allowed
            values are ``DAY``, ``MONTH`` (default) and ``YEAR``.
-        date_field : str, optional
+        date_field : str, optional (default: 'publication_datetime')
            Timestamp column that will be used to calculate the time-series dataset. It can
            be any of the three values: ``publication_datetime`` (default),
            ``modification_datetime``, and ``ingestion_datetime``.
-        group_dimensions : list[str], optional
-            List of fields that will be used to break-down subtotals for each period. This list can
-            have a maximum of 4 elements. Allowed values are ``['source_code', 'subject_codes',
+        group_dimension : str, optional (default: 'source_code')
+            Field name that will be used to break down subtotals for each period. Allowed values are:
+            ``['source_code', 'subject_codes',
            'region_codes', 'industry_codes', 'company_codes', 'person_codes', 'company_codes_about',
            'company_codes_relevance', 'company_codes_cusip', 'company_codes_isin',
            'company_codes_sedol', 'company_codes_ticker', 'company_codes_about_cusip',
@@ -138,31 +246,22 @@
            'company_codes_relevance_cusip', 'company_codes_relevance_isin',
            'company_codes_relevance_sedol', 'company_codes_relevance_ticker']``
        top : int, optional
-            Limits the dataset to return only the top X values for each dimension passed in the
-            ``group_dimensions`` parameter. Default 10.
+            Limits the dataset to return only the top X values for the dimension passed in the
+            ``group_dimension`` parameter. Default 10. Can be set to -1 to return all values.
""" - super().__init__(where, includes, include_lists, excludes, exclude_lists) - - tools.validate_type(frequency, str, "Unexpected value for frequency") - frequency = frequency.upper().strip() - tools.validate_field_options(frequency, const.API_DATETIME_PERIODS) + super().__init__( + where, + includes if includes is not None else {}, + include_lists if include_lists is not None else {}, + excludes if excludes is not None else {}, + exclude_lists if exclude_lists is not None else {} + ) + + # Use property setters for validation self.frequency = frequency - - tools.validate_type(date_field, str, "Unexpected value for date_field") - date_field = date_field.lower().strip() - tools.validate_field_options(date_field, const.API_DATETIME_FIELDS) self.date_field = date_field - - if isinstance(group_dimensions, list): - self.group_dimensions = group_dimensions - # TODO: Validate values in the list group_dimensions are valid form the - # list of all possible columns that can be used for this purpose - - tools.validate_type(top, int, "Unexpected value for top") - if top >= 0: - self.top = top - else: - raise ValueError('Top value is not valid') + self.group_dimension = group_dimension + self.top = top def get_payload(self) -> dict: @@ -178,25 +277,12 @@ def get_payload(self) -> dict: """ payload = super().get_payload() - self.frequency = self.frequency.upper().strip() - tools.validate_field_options(self.frequency, const.API_DATETIME_PERIODS) payload["query"].update({"frequency": self.frequency}) - - self.date_field = self.date_field.lower().strip() - tools.validate_field_options(self.date_field, const.API_DATETIME_FIELDS) payload["query"].update({"date_field": self.date_field}) - if(self.group_dimensions): - if(len(self.group_dimensions)<=4): - for option in self.group_dimensions: - tools.validate_field_options(option, const.API_GROUP_DIMENSIONS_FIELDS) - else: - raise ValueError("The maximiun group_dimensions size is 4") - else: - self.group_dimensions = [] - - payload["query"].update( - {"group_dimensions": self.group_dimensions}) + if(self.group_dimension): + payload["query"].update( + {"group_dimensions": [self.group_dimension]}) payload["query"].update({"top": self.top}) @@ -209,14 +295,87 @@ def __repr__(self): def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): ret_val = super().__str__(detailed, prefix, root_prefix) - ret_val = ret_val.replace('└─...', '├─...') + ret_val = ret_val.replace('└─ex', '├─ex') ret_val += f"\n{prefix}frequency: {tools.print_property(self.frequency)}" ret_val += f"\n{prefix}date_field: {tools.print_property(self.date_field)}" - ret_val += f"\n{prefix}group_dimensions: {tools.print_property(self.group_dimensions)}" + ret_val += f"\n{prefix}group_dimension: {tools.print_property(self.group_dimension)}" ret_val += f"\n{prefix[0:-2]}└─top: {tools.print_property(self.top)}" return ret_val + # Getter and Setter methods + @property + def frequency(self) -> str: + """Get the frequency value.""" + return self._frequency + + @frequency.setter + def frequency(self, value: str) -> None: + """Set the frequency value with validation.""" + tools.validate_type(value, str, "Unexpected value for frequency") + value = value.upper().strip() + tools.validate_field_options(value, const.API_DATETIME_PERIODS) + self._frequency = value + + @property + def date_field(self) -> str: + """Get the date_field value.""" + return self._date_field + + @date_field.setter + def date_field(self, value: str) -> None: + """Set the date_field value with validation.""" + 
tools.validate_type(value, str, "Unexpected value for date_field")
+        value = value.lower().strip()
+        tools.validate_field_options(value, const.API_DATETIME_FIELDS)
+        self._date_field = value
+
+    @property
+    def group_dimension(self) -> list[Any] | str:
+        """Get the group_dimension value."""
+        return self._group_dimension
+
+    @group_dimension.setter
+    def group_dimension(self, value: Optional[list[Any] | str]) -> None:
+        """Set the group_dimension value with validation.
+
+        If value is a string, sets it directly.
+        If value is a list/array, sets only the first value.
+        """
+        if value:
+            # Handle string case
+            if isinstance(value, str):
+                if value in const.API_GROUP_DIMENSIONS_FIELDS:
+                    self._group_dimension = value
+                else:
+                    raise ValueError('Group dimension is not valid')
+            # Handle list/array case - use first value
+            elif isinstance(value, (list, tuple)) and len(value) > 0:
+                first_value = value[0]
+                if isinstance(first_value, str) and first_value in const.API_GROUP_DIMENSIONS_FIELDS:
+                    self._group_dimension = first_value
+                else:
+                    raise ValueError('Group dimension is not valid')
+            else:
+                raise ValueError('Group dimension must be a string or non-empty list/array')
+        else:
+            self._group_dimension = []
+
+    @property
+    def top(self) -> Optional[int]:
+        """Get the top value."""
+        return self._top
+
+    @top.setter
+    def top(self, value: int) -> None:
+        """Set the top value with validation."""
+        tools.validate_type(value, int, "Unexpected value for top")
+        if value >= -1:
+            self._top = value
+        else:
+            raise ValueError('Top value must be an integer greater than or equal to -1')
+
+
 class SnapshotTimeSeries(SnapshotBase):
     """
@@ -233,21 +392,23 @@
     """

-    query : SnapshotTimeSeriesQuery = None
-    job_response : SnapshotTimeSeriesJobReponse = None
+    query : Optional[SnapshotTimeSeriesQuery] = None
+    job_response : Optional[SnapshotTimeSeriesJobReponse] = None

    def __init__(
        self,
+        job_id=None,
        user_key=None,
-        query=None,
-        job_id=None
+        query: Optional[SnapshotBaseQuery] = None
    ):
        super().__init__(user_key=user_key, query=query, job_id=job_id)
        self.__log = log.get_factiva_logger()
-        self.__JOB_BASE_URL = f'{const.API_HOST}{const.API_ANALYTICS_BASEPATH}'
+        self.__JOB_BASE_URL = f"{const.API_HOST}{const.API_ANALYTICS_BASEPATH}"
        if job_id:
-            self.__log.info(f'Creating SnapshotTimeSeries instance with JobID {job_id}')
+            self.__log.info(f"Creating SnapshotTimeSeries instance with JobID {job_id}")
            self.job_response = SnapshotTimeSeriesJobReponse(job_id)
            self.get_job_response()
        elif query:
@@ -258,13 +419,13 @@
            else:
                raise ValueError('Unexpected query type')
        else:
-            self.query = SnapshotTimeSeriesQuery()
+            self.query = SnapshotTimeSeriesQuery()  # type: ignore[assignment]
        self.__log.info('SnapshotTimeSeries created OK')

    @log.factiva_logger
-    def submit_job(self):
+    def submit_job(self, payload=None):
        """
        Performs a POST request to the API using the assigned query to start
        a TimeSeries job.
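As a quick illustration of the validated properties above (a sketch only: the field values are hypothetical, while the class and constant names are the ones introduced in this diff):

    from factiva.analytics.snapshots.time_series import SnapshotTimeSeriesQuery

    tsq = SnapshotTimeSeriesQuery(
        where="language_code = 'en'",        # hypothetical where clause
        frequency='month',                   # normalised to 'MONTH' by the setter
        date_field='PUBLICATION_DATETIME',   # normalised to lower case by the setter
        group_dimension=['source_code'],     # lists are accepted; only the first entry is kept
        top=-1)                              # -1 now means "return all values"
    payload = tsq.get_payload()
    # payload['query'] now carries frequency, date_field,
    # group_dimensions=['source_code'] and top=-1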
@@ -282,28 +443,30 @@
        if not self.query:
            raise ValueError('A query is needed to submit a TimeSeries job')
+        if not self.user_key:
+            raise ValueError('User key is required for API requests')
+
        headers_dict = {
            'user-key': self.user_key.key,
-            'Content-Type': 'application/json',
-            'X-API-VERSION': '2.0'
+            'Content-Type': 'application/json'
        }
-        submit_url = f'{self.__JOB_BASE_URL}'
+        submit_url = f"{self.__JOB_BASE_URL}"
        submit_payload = self.query.get_payload()
        response = req.api_send_request(method='POST', endpoint_url=submit_url, headers=headers_dict, payload=submit_payload)
        if response.status_code == 201:
            response_data = response.json()
-            self.job_response = SnapshotTimeSeriesJobReponse(response_data["data"]["id"])
+            self.job_response = SnapshotTimeSeriesJobReponse(response_data["data"]["id"])  # type: ignore[assignment]
            self.job_response.job_state = response_data['data']['attributes']['current_state']
            self.job_response.job_link = response_data['links']['self']
            if 'errors' in response_data.keys():
                self.job_response.errors = response_data['errors']
        elif response.status_code == 400:
-            raise ValueError(f'Invalid Query [{response.text}]')
+            raise ValueError(f"Invalid Query [{response.text}]")
        else:
-            raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]')
+            raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]")
        self.__log.info('submit_job End')
        return True
@@ -326,31 +489,53 @@
        if (not self.job_response):
            raise RuntimeError('Job has not yet been submitted or Job ID was not set')
+        if not self.user_key:
+            raise ValueError('User key is required for API requests')
+
        headers_dict = {
            'user-key': self.user_key.key,
            'Content-Type': 'application/json'
        }
-        self.__log.info(f'Requesting Analytics Job info for ID {self.job_response.job_id}')
-        getinfo_url = f'{self.__JOB_BASE_URL}/{self.job_response.job_id}'
+        self.__log.info(f"Requesting Analytics Job info for ID {self.job_response.job_id}")
+        getinfo_url = f"{self.__JOB_BASE_URL}/{self.job_response.job_id}"
        response = req.api_send_request(method='GET', endpoint_url=getinfo_url, headers=headers_dict)
+        if response.status_code == 422:
+            headers_dict.update(
+                {'X-API-VERSION': '2.0'}
+            )
+            self.__log.info(f"Retrying Analytics Job info request with X-API-VERSION 2.0 for ID {self.job_response.job_id}")
+            response = req.api_send_request(method='GET', endpoint_url=getinfo_url, headers=headers_dict)
+
        if response.status_code == 200:
-            self.__log.info(f'Job ID {self.job_response.job_id} info retrieved successfully')
+            self.__log.info(f"Job ID {self.job_response.job_id} info retrieved successfully")
            response_data = response.json()
            self.job_response.job_state = response_data['data']['attributes']['current_state']
            self.job_response.job_link = response_data['links']['self']
            if self.job_response.job_state == const.API_JOB_DONE_STATE:
-                self.job_response.data = pd.DataFrame(response_data['data']['attributes']['results'])
+                if 'results' in response_data['data']['attributes'].keys():
+                    self.job_response.data = pd.DataFrame(response_data['data']['attributes']['results'])
+                else:
+                    self.job_response.download_link = response_data['data']['attributes']['download_link']
            if 'errors' in response_data.keys():
                self.job_response.errors = response_data['errors']
        elif response.status_code == 404:
            raise RuntimeError('Job ID does not exist.')
        elif response.status_code == 400:
-            detail = response_data['errors'][0]['detail']
-            raise 
ValueError(f'Bad Request: {detail}') + detail = response.json()['errors'][0]['detail'] + raise ValueError(f"Bad Request: {detail}") else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") + if self.job_response.download_link: + self.__log.info(f"Downloading TimeSeries response file from {self.job_response.download_link.split('/')[-1]}") + response = req.api_send_request(method='GET', endpoint_url=self.job_response.download_link, headers=headers_dict) + if response.status_code == 200: + decoded_response = response.content.decode('utf-8') + jsonl_io = StringIO(decoded_response) + self.job_response.data = pd.read_json(jsonl_io, lines=True) + else: + raise RuntimeError(f"TimeSeries results file download error: [{response.text}]") self.__log.info('get_job_response End') return True @@ -371,6 +556,9 @@ def process_job(self): # TODO: Implement Retries if a 500 or timeout is returne self.submit_job() self.get_job_response() + if not self.job_response: + raise RuntimeError('Job response is not available') + while not (self.job_response.job_state in [const.API_JOB_DONE_STATE, const.API_JOB_FAILED_STATE] diff --git a/src/factiva/analytics/streams/__init__.py b/src/factiva/analytics/streams/__init__.py index 1547175..a6e4cca 100755 --- a/src/factiva/analytics/streams/__init__.py +++ b/src/factiva/analytics/streams/__init__.py @@ -2,7 +2,8 @@ Describe modules included for stream module. """ -__all__ = ['StreamingInstance', 'StreamingQuery', 'StreamingSubscription'] +__all__ = ['StreamingInstance', 'StreamingQuery', 'StreamingSubscription', + 'StreamingInstanceList', 'StreamingInstanceListItem'] -from .streaming_instance import StreamingInstance, StreamingQuery, StreamingSubscription +from .streaming_instance import StreamingInstance, StreamingQuery, StreamingSubscription, StreamingInstanceList, StreamingInstanceListItem # from .listener import Listener diff --git a/src/factiva/analytics/streams/listener.py b/src/factiva/analytics/streams/listener.py index f2d7e97..e8a0ecf 100644 --- a/src/factiva/analytics/streams/listener.py +++ b/src/factiva/analytics/streams/listener.py @@ -97,7 +97,7 @@ def stream_id_uri(self): """Property for retrieving the stream id uri.""" host = self.user_key.get_uri_context() stream_id = '-'.join(self.subscription_id.split("-")[:-2]) - return f'{host}/streams/{stream_id}' + return f"{host}/streams/{stream_id}" # pylint: disable=too-many-arguments diff --git a/src/factiva/analytics/streams/stream.py b/src/factiva/analytics/streams/stream.py index 1d1e9aa..6108567 100644 --- a/src/factiva/analytics/streams/stream.py +++ b/src/factiva/analytics/streams/stream.py @@ -78,7 +78,7 @@ def __init__( @property def stream_url(self) -> str: """List Stream's URL address.""" - return f'{common.API_HOST}{common.API_STREAMS_BASEPATH}' + return f"{common.API_HOST}{common.API_STREAMS_BASEPATH}" @property def all_subscriptions(self) -> List[str]: @@ -187,7 +187,7 @@ def delete(self) -> StreamResponse: if not self.stream_id: raise common.UNDEFINED_STREAM_ID_ERROR - uri = f'{self.stream_url}/{self.stream_id}' + uri = f"{self.stream_url}/{self.stream_id}" headers = { 'user-key': self.stream_user.key, 'content-type': 'application/json' @@ -258,10 +258,10 @@ def create_subscription(self) -> str: return new_subscription.id except Exception as error: raise RuntimeError( - f''' + f""" Unexpected error happened while creating the subscription: 
{error} - ''' + """ ) @factiva_logger @@ -437,7 +437,7 @@ def _create_by_snapshot_id(self) -> StreamResponse: 'user-key': self.stream_user.key, 'content-type': 'application/json' } - uri = f'{common.API_HOST}{common.API_SNAPSHOTS_BASEPATH}/{self.snapshot_id}/streams' + uri = f"{common.API_HOST}{common.API_SNAPSHOTS_BASEPATH}/{self.snapshot_id}/streams" response = req.api_send_request( method='POST', endpoint_url=uri, @@ -508,27 +508,27 @@ def __str__(self, detailed=True, prefix=' |-', root_prefix=''): child_prefix = ' | ' + prefix ret_val = str(self.__class__) + '\n' - ret_val += f'{prefix}user_key: ' + ret_val += f"{prefix}user_key: " ret_val += self.user_key.__str__() del pprop['user_key'] ret_val += '\n' - ret_val += f'{prefix}query: ' + ret_val += f"{prefix}query: " ret_val += self.query.__str__(detailed=False, prefix=child_prefix) del pprop['query'] ret_val += '\n' - ret_val += f'{prefix}last_explain_job: ' + ret_val += f"{prefix}last_explain_job: " ret_val += self.last_explain_job.__str__(detailed=False, prefix=child_prefix) del pprop['last_explain_job'] ret_val += '\n' - ret_val += f'{prefix}last_analytics_job: ' + ret_val += f"{prefix}last_analytics_job: " ret_val += self.last_analytics_job.__str__(detailed=False, prefix=child_prefix) del pprop['last_analytics_job'] ret_val += '\n' - ret_val += f'{prefix}last_extraction_job: ' + ret_val += f"{prefix}last_extraction_job: " ret_val += self.last_extraction_job.__str__(detailed=False, prefix=child_prefix) del pprop['last_extraction_job'] ret_val += '\n' diff --git a/src/factiva/analytics/streams/stream_response.py b/src/factiva/analytics/streams/stream_response.py index 8b2b24d..cf69d2c 100644 --- a/src/factiva/analytics/streams/stream_response.py +++ b/src/factiva/analytics/streams/stream_response.py @@ -107,12 +107,12 @@ def parse_object(self, data, level=2): idents = "\t" * level for index_k, index_v in data.items(): if isinstance(index_v, dict): - object_repr += f'{idents}{index_k}: \n{self.parse_object(index_v, level + 1)}\n' + object_repr += f"{idents}{index_k}: \n{self.parse_object(index_v, level + 1)}\n" elif isinstance(index_v, list): for att in index_v: - object_repr += f'{idents}{index_k}: \n{self.parse_object(att, level + 1)}\n' + object_repr += f"{idents}{index_k}: \n{self.parse_object(att, level + 1)}\n" else: - object_repr += f'{idents}{index_k}: {index_v}\n' + object_repr += f"{idents}{index_k}: {index_v}\n" return object_repr diff --git a/src/factiva/analytics/streams/streaming_instance.py b/src/factiva/analytics/streams/streaming_instance.py index ae50232..8620cf1 100644 --- a/src/factiva/analytics/streams/streaming_instance.py +++ b/src/factiva/analytics/streams/streaming_instance.py @@ -2,6 +2,7 @@ Module containing all clases that interact with the Factiva Analytics - Streams service """ import time +import pandas as pd from ..auth import UserKey from ..snapshots.base import SnapshotBaseQuery from ..common import log, const, req, config, tools @@ -35,8 +36,8 @@ def __repr__(self): def __str__(self, table=False, prefix=' ├─', root_prefix=''): if not table: - ret_val = f"{root_prefix}8}" return ret_val @@ -99,8 +100,8 @@ def __repr__(self): def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): - ret_val = f"{root_prefix} 80 else self.where # if detailed: ret_val += f"\n{prefix}includes: " @@ -127,7 +128,7 @@ class StreamingInstance(): def __init__(self, id=None, query=None, user_key=None) -> None: self.__log = log.get_factiva_logger() - self.__JOB_BASE_URL = f'{const.API_HOST}{const.API_STREAMS_BASEPATH}' + 
self.__JOB_BASE_URL = f"{const.API_HOST}{const.API_STREAMS_BASEPATH}" self.status = 'NOT_CREATED' self.__log.info('creating StreamingInstance...') @@ -144,7 +145,7 @@ def __init__(self, id=None, query=None, user_key=None) -> None: raise ValueError("The query and id parameters cannot be assigned simultaneously") if id: - self.__log.info(f'Creating a StreamingInstance with ID {id}') + self.__log.info(f"Creating a StreamingInstance with ID {id}") # Considers two types of IDs: # - dj-synhub-stream-lufcwmlbrmmpg1p1kmq9c1ex8blcnqdu-obhztjwvqa # - obhztjwvqa @@ -153,7 +154,7 @@ def __init__(self, id=None, query=None, user_key=None) -> None: self.short_id = id.split('-')[-1] elif len(id) == 10: self.short_id = id - self.id = f'dj-synhub-stream-{self.user_key.key.lower()}-{id}' + self.id = f"dj-synhub-stream-{self.user_key.key.lower()}-{id}" self.get_status() elif query: if isinstance(query, StreamingQuery): @@ -216,15 +217,15 @@ def create(self): const.API_JOB_FAILED_STATE, const.API_JOB_RUNNING_STATE]): if self.status not in const.API_JOB_EXPECTED_STATES: - raise RuntimeError(f'Unexpected job status: {self.status}') + raise RuntimeError(f"Unexpected job status: {self.status}") time.sleep(const.API_JOB_ACTIVE_WAIT_SPACING) self.get_status() if self.status in [const.API_JOB_CANCELLED_STATE, const.API_JOB_FAILED_STATE]: - raise RuntimeError(f'StreamingInstance creation failed with status: {self.status}') + raise RuntimeError(f"StreamingInstance creation failed with status: {self.status}") elif response.status_code == 400: - raise ValueError(f'Invalid Query [{response.text}]') + raise ValueError(f"Invalid Query [{response.text}]") else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") self.__log.info('submit_job OK') return True @@ -258,7 +259,7 @@ def get_status(self): 'Content-Type': 'application/json' } - status_url = f'{self.__JOB_BASE_URL}/{self.id}' + status_url = f"{self.__JOB_BASE_URL}/{self.id}" response = req.api_send_request(method='GET', endpoint_url=status_url, headers=headers_dict) if response.status_code == 200: @@ -269,7 +270,7 @@ def get_status(self): for sub in resp_subscriptions: self.subscriptions.append(StreamingSubscription(sub['id'], self.user_key)) else: - raise RuntimeError(f'API request returned an unexpected HTTP status, with content [{response.text}]') + raise RuntimeError(f"API request returned an unexpected HTTP status, with content [{response.text}]") self.__log.info('get_status OK') return True @@ -280,7 +281,7 @@ def __repr__(self): def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): - ret_val = f"{root_prefix}" ret_val += f"\n{prefix}short_id: {tools.print_property(self.short_id)}" @@ -308,3 +309,71 @@ def __str__(self, detailed=True, prefix=' ├─', root_prefix=''): ret_val += f"\n{prefix.replace('├', '└')}status: {tools.print_property(self.status)}" return ret_val + + +# Implement StreamingInstanceListItem and StreamingInstanceList classes +class StreamingInstanceListItem(): + + id: str = None + short_id: str = None + job_status: str = None + n_subscriptions: str = None + + def __init__(self, id:str=None, + short_id:str=None, + job_status:str=None, + n_subscriptions:str=None) -> None: + self.id = id + self.short_id = short_id + self.job_status = job_status + self.n_subscriptions = n_subscriptions + + + def __repr__(self): + return self.__str__() + + + def __str__(self, prefix=' ├─', root_prefix='', 
row=True, index=None):
+        if row:
+            if index is not None:
+                prefix = f"{prefix}[{index:<3}] "
+            ret_val = f"{prefix}{self.short_id:<12} {self.job_status:<22} {self.n_subscriptions:<16}\n"
+        else:
+            ret_val = f"{root_prefix}<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n"
+            ret_val += f"{prefix}short_id: {self.short_id}\n"
+            ret_val += f"{prefix}job_status: {self.job_status}\n"
+            ret_val += f"{prefix}n_subscriptions: {self.n_subscriptions}"
+        return ret_val
+
+
+class StreamingInstanceList(list):
+
+    items: list[StreamingInstanceListItem] = None
+
+
+    def __init__(self, df_streams: pd.DataFrame = None) -> None:
+        self.items = []
+        if df_streams is not None:
+            for index, row in df_streams.iterrows():
+                self.items.append(StreamingInstanceListItem(
+                    id=row['stream_id'],
+                    short_id=row['short_id'],
+                    job_status=row['job_status'],
+                    n_subscriptions=row['n_subscriptions']
+                ))
+
+
+    def __getitem__(self, index):
+        return StreamingInstance(self.items[index].short_id)
+
+
+    def __repr__(self):
+        return self.__str__()
+
+
+    def __str__(self, prefix=' ├─'):
+        ret_val = f"<'factiva.analytics.{str(self.__class__).split('.')[-1]}\n"
+        ret_val += f"{prefix} {'short_id':<12} {'job_status':<22} {'n_subscriptions':<16}\n"
+        for ix, item in enumerate(self.items):
+            ret_val += item.__str__(row=True, index=ix)
+        return ret_val
diff --git a/src/factiva/analytics/streams/subscription.py b/src/factiva/analytics/streams/subscription.py
index 0d21a4d..2d2fb25 100644
--- a/src/factiva/analytics/streams/subscription.py
+++ b/src/factiva/analytics/streams/subscription.py
@@ -52,7 +52,7 @@ def __init__(self, stream_id=None, id=None, subscription_type=None):
        except Exception:
            raise common.UNDEFINED_STREAM_ID_ERROR
-        self.url = f'{common.API_HOST}{common.API_STREAMS_BASEPATH}'
+        self.url = f"{common.API_HOST}{common.API_STREAMS_BASEPATH}"
        self.stream_id = stream_id
        # pylint: disable=invalid-name
        self.id = id
@@ -61,7 +61,7 @@
    def __repr__(self):
        """Create string representation for Subscription Class."""
-        return f'Subscription(id={self.id}, type={self.subscription_type})'
+        return f"Subscription(id={self.id}, type={self.subscription_type})"

    @factiva_logger
    def create_listener(self, user):
diff --git a/src/factiva/analytics/taxonomy/company_identifiers.py b/src/factiva/analytics/taxonomy/company_identifiers.py
index ef85792..d8c540d 100644
--- a/src/factiva/analytics/taxonomy/company_identifiers.py
+++ b/src/factiva/analytics/taxonomy/company_identifiers.py
@@ -1,15 +1,13 @@
 from ..common import tools
 from ..common import req
-from .. import (UserKey, factiva_logger, get_factiva_logger)
-from ..common import (API_COMPANIES_IDENTIFIER_TYPE, API_HOST,
-                      API_SNAPSHOTS_COMPANIES_BASEPATH,
-                      API_SNAPSHOTS_COMPANIES_PIT,
-                      API_SNAPSHOTS_TAXONOMY_BASEPATH,
-                      DOWNLOAD_DEFAULT_FOLDER,
-                      TICKER_COMPANY_IDENTIFIER)
+from ..common import log
+from ..common import const
+from ..auth import UserKey
+import pandas as pd

-class Company():
+class FactivaCompany():
    """Class that represents the companies available within the Snapshots API.
    Parameters
    ----------
@@ -29,8 +27,8 @@
    >>> c = FactivaCompany(user_key=u)
    """

-    __API_ENDPOINT_TAXONOMY = f'{API_HOST}{API_SNAPSHOTS_TAXONOMY_BASEPATH}'
-    __API_ENDPOINT_COMPANY = f'{API_HOST}{API_SNAPSHOTS_COMPANIES_BASEPATH}'
+    __API_ENDPOINT_TAXONOMY = f"{const.API_HOST}{const.API_SNAPSHOTS_TAXONOMY_BASEPATH}"
+    __API_ENDPOINT_COMPANY = f"{const.API_HOST}{const.API_SNAPSHOTS_COMPANIES_BASEPATH}"
    __TICKER_COMPANY_IDENTIFIER_NAME = 'ticker_exchange'
    user_key=None
@@ -38,10 +36,11 @@
    def __init__(self, user_key=None):
        """Class initializer"""
        self.user_key = UserKey(user_key, True)
-        self.log= get_factiva_logger()
+        # self.log = log.get_factiva_logger()
+        self.__log = log.get_factiva_logger()

-    @factiva_logger
+    @log.factiva_logger
    def get_identifiers(self) -> list:
        """Request the list of available company identifier types.
@@ -70,7 +69,7 @@
            'user-key': self.user_key.key
        }
-        endpoint = f'{common.API_HOST}{common.API_SNAPSHOTS_COMPANY_IDENTIFIERS_BASEPATH}'
+        endpoint = f"{const.API_HOST}{const.API_SNAPSHOTS_COMPANY_IDENTIFIERS_BASEPATH}"
        response = req.api_send_request(method='GET', endpoint_url=endpoint, headers=headers_dict)
@@ -80,7 +79,7 @@
        raise RuntimeError('API Request returned an unexpected HTTP status')

-    @factiva_logger
+    @log.factiva_logger
    def validate_point_time_request(self, identifier):
        """Validate whether the user is allowed to perform the company operation
        and whether the given identifier is valid
@@ -98,9 +97,9 @@
        if (not len(self.user_key.enabled_company_identifiers)):
            raise ValueError('User is not allowed to perform this operation')
-        tools.validate_field_options(identifier, API_COMPANIES_IDENTIFIER_TYPE)
+        tools.validate_field_options(identifier, const.API_COMPANIES_IDENTIFIER_TYPE)
-        if (identifier == TICKER_COMPANY_IDENTIFIER):
+        if (identifier == const.TICKER_COMPANY_IDENTIFIER):
            identifier = self.__TICKER_COMPANY_IDENTIFIER_NAME
        identifier_description = list(
@@ -109,7 +108,7 @@
        if (not len(identifier_description)):
            raise ValueError('User is not allowed to perform this operation')

-    @factiva_logger
+    @log.factiva_logger
    def point_in_time_download_all(self,
                                   identifier,
                                   file_name,
@@ -145,17 +144,19 @@
        self.validate_point_time_request(identifier)
        if (to_save_path is None):
+            # TODO: Fix this!
+            DOWNLOAD_DEFAULT_FOLDER = 'fix this'
            to_save_path = DOWNLOAD_DEFAULT_FOLDER
        headers_dict = {'user-key': self.user_key.key}
-        endpoint = f'{self.__API_ENDPOINT_TAXONOMY}{API_SNAPSHOTS_COMPANIES_PIT}/{identifier}/{file_format}'
+        endpoint = f"{self.__API_ENDPOINT_TAXONOMY}{const.API_SNAPSHOTS_COMPANIES_PIT}/{identifier}/{file_format}"
        local_file_name = req.download_file(endpoint, headers_dict, file_name, file_format, to_save_path, add_timestamp)
        return local_file_name

-    @factiva_logger
+    @log.factiva_logger
    def point_in_time_query(self, identifier, value) -> dict:
        """Returns the resolved Factiva code and the date ranges during which
        the instrument from the given identifier was valid.
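For orientation, an illustrative call pattern for the renamed class. This is a sketch only: it assumes valid credentials are configured in the environment, that the account has company identifiers enabled, and that 'isin' is among const.API_COMPANIES_IDENTIFIER_TYPE; the ISIN value is hypothetical.

    from factiva.analytics.taxonomy.company_identifiers import FactivaCompany

    c = FactivaCompany()                   # builds a UserKey from the environment
    c.validate_point_time_request('isin')  # raises ValueError if the account lacks the entitlement
    match = c.point_in_time_query('isin', 'US0378331005')  # hypothetical identifier/value pair
    print(match)                           # resolved Factiva code plus validity date ranges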
diff --git a/src/factiva/analytics/taxonomy/factiva_taxonomies.py b/src/factiva/analytics/taxonomy/factiva_taxonomies.py
index 2ae431f..d957983 100644
--- a/src/factiva/analytics/taxonomy/factiva_taxonomies.py
+++ b/src/factiva/analytics/taxonomy/factiva_taxonomies.py
@@ -87,7 +87,7 @@ class FactivaTaxonomy():
 
     """
 
-    __TAXONOMY_BASEURL = f'{const.API_HOST}{const.API_SNAPSHOTS_TAXONOMY_BASEPATH}'
+    __TAXONOMY_BASEURL = f"{const.API_HOST}{const.API_SNAPSHOTS_TAXONOMY_BASEPATH}"
 
     all_subjects = None
     all_regions = None
@@ -100,7 +100,8 @@ def __init__(self, user_key=None):
             self.user_key = user_key
         else:
             self.user_key = UserKey(user_key)
-        self.log= log.get_factiva_logger()
+        # self.log= log.get_factiva_logger()
+        self.__log = log.get_factiva_logger()
         self.all_subjects = None
         self.all_regions = None
         self.all_industries = None
@@ -138,7 +139,7 @@ def get_category_codes(self, category:FactivaTaxonomyCategories) -> pd.DataFrame
             from factiva.analytics import FactivaTaxonomy, FactivaTaxonomyCategories
             t = FactivaTaxonomy()
             industry_codes = t.get_category_codes(FactivaTaxonomyCategories.INDUSTRIES)
-            industry_codes
+            print(industry_codes)
 
         .. code-block::
@@ -165,7 +166,7 @@ def get_category_codes(self, category:FactivaTaxonomyCategories) -> pd.DataFrame
         headers_dict = {
             'user-key': self.user_key.key
         }
-        endpoint = f'{self.__TAXONOMY_BASEURL}/{category.value}/{response_format}'
+        endpoint = f"{self.__TAXONOMY_BASEURL}/{category.value}/{response_format}"
         response = req.api_send_request(method='GET', endpoint_url=endpoint, headers=headers_dict, stream=True)
         if response.status_code == 200:
@@ -248,7 +249,7 @@ def download_raw_category(self, category:FactivaTaxonomyCategories, path=None, f
             raise ValueError('The file_format parameter must be either csv or avro.')
         if not path:
             path = os.getcwd()
-        endpoint = f'{self.__TAXONOMY_BASEURL}/{category.value}/{file_format}'
+        endpoint = f"{self.__TAXONOMY_BASEURL}/{category.value}/{file_format}"
         download_headers = {
             'user-key': self.user_key.key
         }
@@ -260,6 +261,7 @@ def download_raw_category(self, category:FactivaTaxonomyCategories, path=None, f
 
         return True
 
+    @log.factiva_logger
     def lookup_code(self, code:str, category:FactivaTaxonomyCategories) -> dict:
         """
         Finds the descriptor and other details based on the provided code and
@@ -343,9 +345,9 @@ def lookup_code(self, code:str, category:FactivaTaxonomyCategories) -> dict:
             return f_df.iloc[0].to_dict()
 
         # When code is not found
-        return {'error': f'Code {code} not found in {category.value}',
+        return {'error': f"Code {code} not found in {category.value}",
                 'code': 'UNKNOWN',
-                'descriptor': f'ERR: Code {code} not found in {category.value}'}
+                'descriptor': f"ERR: Code {code} not found in {category.value}"}
 
 
     def __repr__(self):
@@ -364,9 +366,9 @@ def __str__(self, detailed=True, prefix=' ├─', root_prefix=''):
         ret_val += f"{prefix}user_key: {self.user_key.__str__(detailed=False, prefix=' │ ├─')}\n"
 
         if detailed:
-            ret_val += '\n'.join((f'{prefix}{item}: {tools.print_property(pprop[item])}' for item in pprop))
+            ret_val += '\n'.join((f"{prefix}{item}: {tools.print_property(pprop[item])}" for item in pprop))
             ret_val += f"\n{prefix[0:-2]}└─all_companies: {tools.print_property(self.all_companies)}"
         else:
-            ret_val += f'\n{prefix[0:-2]}└─...'
+            ret_val += f"\n{prefix[0:-2]}└─..."
         return ret_val
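Note that the lookup fallback above never raises for an unknown code; it returns a dict with code 'UNKNOWN'. A minimal sketch mirroring the docstring example and the taxonomy tests further below (assumes FACTIVA_USERKEY is set in the environment):

from factiva.analytics import FactivaTaxonomy, FactivaTaxonomyCategories

t = FactivaTaxonomy()
detail = t.lookup_code('MCAT', FactivaTaxonomyCategories.SUBJECTS)
print(detail)   # dict with 'code' and 'descriptor'; an unknown code yields
                # the {'error': ..., 'code': 'UNKNOWN', ...} fallback above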
diff --git a/test/article_retrieval/test_articleretrieval.py b/test/article_fetcher/test_articlefetcher.py
similarity index 60%
rename from test/article_retrieval/test_articleretrieval.py
rename to test/article_fetcher/test_articlefetcher.py
index ee4aa91..7f73117 100755
--- a/test/article_retrieval/test_articleretrieval.py
+++ b/test/article_fetcher/test_articlefetcher.py
@@ -1,10 +1,12 @@
 """
-    Tests for the ArticleRetrieval module
+    Tests for the ArticleFetcher module
 """
 import pytest
-from factiva.analytics import OAuthUser, ArticleRetrieval, UIArticle
-from factiva.analytics.common import config
+import time
+from factiva.analytics import OAuthUser, ArticleFetcher, UIArticle
+from factiva.analytics.common import config, const
 
+GITHUB_CI = config.load_environment_value('CI', False)
 FACTIVA_CLIENTID = config.load_environment_value("FACTIVA_CLIENTID")
 FACTIVA_USERNAME = config.load_environment_value("FACTIVA_USERNAME")
 FACTIVA_PASSWORD = config.load_environment_value("FACTIVA_PASSWORD")
@@ -26,21 +28,25 @@ def _assert_uiarticle_values(uiarticle: UIArticle):
     assert uiarticle.source_name == 'The Wall Street Journal Online'
 
 
-def test_article_retrieval_env_user():
+def test_article_fetcher_env_user():
     """"
-    Creates the object using the ENV variable and request the usage details to the API service
+    Creates the object using the ENV variable and requests the article content from the API service
     """
-    ar = ArticleRetrieval()
-    article = ar.retrieve_single_article(ARTICLE_ID)
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
+    ar = ArticleFetcher()
+    article = ar.fetch_single_article(ARTICLE_ID)
     _assert_uiarticle_values(article)
 
 
-def test_article_retrieval_params_user():
+def test_article_fetcher_params_user():
+    """
+    Creates the object using the passed params and requests the article content from the API service
+    """
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     o = OAuthUser(client_id=FACTIVA_CLIENTID,
                   username=FACTIVA_USERNAME,
                   password=FACTIVA_PASSWORD)
-    ar = ArticleRetrieval(oauth_user=o)
-    article = ar.retrieve_single_article(ARTICLE_ID)
+    ar = ArticleFetcher(oauth_user=o)
+    article = ar.fetch_single_article(ARTICLE_ID)
     _assert_uiarticle_values(article)
-
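The renamed test module exercises the new ArticleFetcher entry point. A minimal sketch of the same flow outside pytest (assumes OAuth credentials are set via FACTIVA_CLIENTID / FACTIVA_USERNAME / FACTIVA_PASSWORD; the AN below is a hypothetical placeholder, not a real article identifier):

from factiva.analytics import ArticleFetcher

ar = ArticleFetcher()                       # builds an OAuthUser from the environment
article = ar.fetch_single_article('DJDN000020230101xxxxxxxx')  # hypothetical AN
print(article.source_name)                  # UIArticle attribute, display use only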
+ """ + if isinstance(usr, str): + usr = AccountInfo(stats=True) + assert isinstance(usr.user_key.key, str) + assert isinstance(usr.user_key.cloud_token, dict) + assert isinstance(usr.account_name, str) + assert isinstance(usr.active_product, str) + # assert isinstance(usr.max_allowed_concurrent_extractions, int) + assert isinstance(usr.max_allowed_extracted_documents, int) + assert isinstance(usr.max_allowed_extractions, int) + assert isinstance(usr.remaining_documents, int) + assert isinstance(usr.remaining_extractions, int) + # assert isinstance(usr.total_downloaded_bytes, int or str) + assert isinstance(usr.total_extracted_documents, int) + assert isinstance(usr.total_extractions, int) + assert isinstance(usr.total_stream_instances, int) + assert isinstance(usr.total_stream_subscriptions, int) + assert isinstance(usr.enabled_company_identifiers, list) + # Assert streams + # Assert extractions + + +def _test_accountinfo_values(usr): + """ + Checks if values within the expected lengths and ranges + were returned + """ + if isinstance(usr, str): + usr = AccountInfo(stats=True) + assert usr.user_key.key == FACTIVA_USERKEY + assert len(usr.account_name) >= 0 + assert len(usr.active_product) >= 0 + # assert usr.max_allowed_concurrent_extractions >= 0 + assert usr.max_allowed_extracted_documents >= 0 + assert usr.max_allowed_extractions >= 0 + # assert usr.total_downloaded_bytes >= 0 + assert usr.total_extracted_documents >= 0 + assert usr.total_extractions >= 0 + assert usr.total_stream_instances >= 0 + assert usr.total_stream_subscriptions >= 0 + assert len(usr.enabled_company_identifiers) >= 0 + +def test_accountinfo_invalid_key(): + """ + Creates an object from the provided string and request the usage details to the API service + The key is invalid and this should validate how the error is processed + """ + time.sleep(const.TEST_REQUEST_SPACING_SECONDS) + with pytest.raises(ValueError, match=r'Factiva User-Key does not exist or inactive.'): + AccountInfo(DUMMY_KEY) + + +def test_accountinfo_invald_lenght_key(): + """ + Attempts to create an object with malformed keys. This requires assert the raised exception. 
+ """ + with pytest.raises(ValueError, match=r'Factiva User-Key has the wrong length'): + AccountInfo('abc') + +def test_accountinfo_with_stats(): + """" + Creates the object using the ENV variable and request the usage details to the API service + """ + time.sleep(const.TEST_REQUEST_SPACING_SECONDS) + usr = AccountInfo() + _test_accountinfo_types(usr) + _test_accountinfo_values(usr) + + +def test_accountinfo_with_parameter_and_stats(): + """ + API Key is passed as a string + """ + time.sleep(const.TEST_REQUEST_SPACING_SECONDS) + usr = AccountInfo(FACTIVA_USERKEY) + _test_accountinfo_types(usr) + _test_accountinfo_values(usr) diff --git a/test/auth/test_oauthuser.py b/test/auth/test_oauthuser.py index 60ee944..44de528 100755 --- a/test/auth/test_oauthuser.py +++ b/test/auth/test_oauthuser.py @@ -2,9 +2,11 @@ Tests for the UserKey module """ import pytest +import time from factiva.analytics import OAuthUser -from factiva.analytics.common import config +from factiva.analytics.common import config, const +GITHUB_CI = config.load_environment_value('CI', False) FACTIVA_CLIENTID = config.load_environment_value("FACTIVA_CLIENTID") FACTIVA_USERNAME = config.load_environment_value("FACTIVA_USERNAME") FACTIVA_PASSWORD = config.load_environment_value("FACTIVA_PASSWORD") @@ -50,6 +52,7 @@ def test_wrong_credentials(): Creates an object from the provided string The key is invalid and this should validate how the error is processed """ + time.sleep(const.TEST_REQUEST_SPACING_SECONDS) with pytest.raises(PermissionError, match=r'Invalid user credentials'): o = OAuthUser(client_id='client_id_value', username='username_value', diff --git a/test/auth/test_userkey.py b/test/auth/test_userkey.py index 13b79fa..261c2b9 100755 --- a/test/auth/test_userkey.py +++ b/test/auth/test_userkey.py @@ -2,126 +2,33 @@ Tests for the UserKey module """ import pytest +import time from factiva.analytics import UserKey -from factiva.analytics.common import config +from factiva.analytics.common import config, const +GITHUB_CI = config.load_environment_value('CI', False) FACTIVA_USERKEY = config.load_environment_value("FACTIVA_USERKEY") DUMMY_KEY = 'abcd1234abcd1234abcd1234abcd1234' -# API Response sample with the most complete set of attributes -# { -# "data": { -# "id": "abcd1234abcd1234abcd1234abcd1234", -# "attributes": { -# "cnt_curr_ext": 1, -# "current_downloaded_amount": 427567508, -# "max_allowed_concurrent_extracts": 10, -# "max_allowed_document_extracts": 2500000, -# "max_allowed_extracts": 5, -# "name": "Company Corp", -# "products": "DNA", -# "tot_document_extracts": 1595383, -# "tot_extracts": 4, -# "tot_subscriptions": 0, -# "tot_topics": 0, -# "licensed_company_ids": [ -# 4, -# 3, -# 1, -# 5 -# ], -# "enabled_company_identifiers": [ -# { -# "id": 4, -# "name": "isin" -# }, -# { -# "id": 3, -# "name": "cusip" -# }, -# { -# "id": 1, -# "name": "sedol" -# }, -# { -# "id": 5, -# "name": "ticker_exchange" -# } -# ] -# }, -# "type": "account_with_contract_limits" -# } -# } - -def _test_userkey_types(usr): - """" - Checks the correct types were returned. 
- """ - if isinstance(usr, str): - usr = UserKey(stats=True) - assert isinstance(usr.key, str) - assert isinstance(usr.cloud_token, dict) - assert isinstance(usr.account_name, str) - assert isinstance(usr.active_product, str) - assert isinstance(usr.max_allowed_concurrent_extractions, int) - assert isinstance(usr.max_allowed_extracted_documents, int) - assert isinstance(usr.max_allowed_extractions, int) - assert isinstance(usr.remaining_documents, int) - assert isinstance(usr.remaining_extractions, int) - assert isinstance(usr.total_downloaded_bytes, int) - assert isinstance(usr.total_extracted_documents, int) - assert isinstance(usr.total_extractions, int) - assert isinstance(usr.total_stream_instances, int) - assert isinstance(usr.total_stream_subscriptions, int) - assert isinstance(usr.enabled_company_identifiers, list) - - -def _test_userkey_values(usr): - """ - Checks if values within the expected lengths and ranges - were returned - """ - if isinstance(usr, str): - usr = UserKey(stats=True) - assert usr.key == FACTIVA_USERKEY - assert len(usr.account_name) >= 0 - assert len(usr.active_product) >= 0 - assert usr.max_allowed_concurrent_extractions >= 0 - assert usr.max_allowed_extracted_documents >= 0 - assert usr.max_allowed_extractions >= 0 - assert usr.total_downloaded_bytes >= 0 - assert usr.total_extracted_documents >= 0 - assert usr.total_extractions >= 0 - assert usr.total_stream_instances >= 0 - assert usr.total_stream_subscriptions >= 0 - assert len(usr.enabled_company_identifiers) >= 0 - -def test_userkey_with_stats(): - """" - Creates the object using the ENV variable and request the usage details to the API service - """ - usr = UserKey(stats=True) - _test_userkey_types(usr) - _test_userkey_values(usr) - - -def test_userkey_without_stats(): +def test_userkey_from_env(): """ Creates an empty object from the ENV variable with a value only for the key property """ + time.sleep(const.TEST_REQUEST_SPACING_SECONDS) usr = UserKey() - _test_userkey_types(usr) - _test_userkey_values(usr) + assert usr.key == FACTIVA_USERKEY + assert isinstance(usr.cloud_token, dict) def test_user_with_parameter_and_stats(): """ API Key is passed as a string and stats=True """ - usr = UserKey(key=FACTIVA_USERKEY, stats=True) - _test_userkey_types(usr) - _test_userkey_values(usr) + time.sleep(const.TEST_REQUEST_SPACING_SECONDS) + usr = UserKey(FACTIVA_USERKEY) + assert usr.key == FACTIVA_USERKEY + assert isinstance(usr.cloud_token, dict) def test_invalid_key(): @@ -129,8 +36,9 @@ def test_invalid_key(): Creates an object from the provided string and request the usage details to the API service The key is invalid and this should validate how the error is processed """ + time.sleep(const.TEST_REQUEST_SPACING_SECONDS) with pytest.raises(ValueError, match=r'Factiva User-Key does not exist or inactive.'): - UserKey(DUMMY_KEY, stats=True) + UserKey(DUMMY_KEY) def test_invald_lenght_key(): diff --git a/test/readme.md b/test/readme.md index ee01f0c..ff9fdfe 100644 --- a/test/readme.md +++ b/test/readme.md @@ -1,4 +1,4 @@ -# Test notes for the factiva-news Python package +# Test notes for the factiva-analytics Python Library General notes that describe the context the package is expected to be tested, and other considerations. 
diff --git a/test/readme.md b/test/readme.md
index ee01f0c..ff9fdfe 100644
--- a/test/readme.md
+++ b/test/readme.md
@@ -1,4 +1,4 @@
-# Test notes for the factiva-news Python package
+# Test notes for the factiva-analytics Python Library
 
 General notes that describe the context the package is expected to be tested, and other considerations.
 
 ## Runtime
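All the test diffs below repeat one pattern: a pause at the top of every API-backed test so consecutive requests stay under account rate limits. The sketch distils it (TEST_REQUEST_SPACING_SECONDS comes from the shared const module; the test name is a hypothetical placeholder):

import time
from factiva.analytics.common import const

def test_some_api_operation():
    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)  # space out API calls
    # ...create the object under test and assert against the live API...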
diff --git a/test/snapshots/test_explain.py b/test/snapshots/test_explain.py
index 8c6ee74..ca5362a 100644
--- a/test/snapshots/test_explain.py
+++ b/test/snapshots/test_explain.py
@@ -1,4 +1,5 @@
 import pytest
+import time
 from factiva.analytics.common import config, const
 from factiva.analytics import SnapshotExplain, UserKey, SnapshotExplainQuery
 import pandas as pd
@@ -13,6 +14,7 @@
 
 # Test operations before any API request
 def test_create_from_envuser():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     se = SnapshotExplain()
     assert isinstance(se, SnapshotExplain)
     assert se.user_key.key == ENVIRONMENT_USER_KEY
@@ -23,6 +25,7 @@
     }
 
 def test_create_from_user_param():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     se = SnapshotExplain(user_key=VALID_USER_KEY)
     assert isinstance(se, SnapshotExplain)
     assert se.user_key.key == VALID_USER_KEY
@@ -33,6 +36,7 @@
     }
 
 def test_create_from_userkey():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     u = UserKey()
     assert isinstance(u, UserKey)
     se = SnapshotExplain(user_key=u)
@@ -45,6 +49,7 @@
     }
 
 def test_create_envuser_where():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     se = SnapshotExplain(query=VALID_WHERE_STATEMENT)
     assert isinstance(se, SnapshotExplain)
     assert se.user_key.key == ENVIRONMENT_USER_KEY
@@ -55,6 +60,7 @@
     }
 
 def test_create_envuser_envwhere():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     seq = SnapshotExplainQuery()
     assert isinstance(seq, SnapshotExplainQuery)
     se = SnapshotExplain(query=seq)
@@ -67,6 +73,8 @@
     }
 
 def test_failed_where_and_jobid():
+    if GITHUB_CI:
+        pytest.skip("Not to be tested in GitHub Actions")
     with pytest.raises(ValueError, match=r'The query and job_id parameters*'):
         se = SnapshotExplain(query=VALID_WHERE_STATEMENT, job_id='abcd1234-ab12-ab12-ab12-abcdef123456')
         assert isinstance(se, SnapshotExplain)
@@ -78,6 +86,7 @@ def test_job_envwhere_samples():
     if GITHUB_CI:
         pytest.skip("Not to be tested in GitHub Actions")
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     se = SnapshotExplain()
     assert isinstance(se, SnapshotExplain)
     assert se.process_job()
diff --git a/test/snapshots/test_extraction.py b/test/snapshots/test_extraction.py
index 361de87..84fd22d 100644
--- a/test/snapshots/test_extraction.py
+++ b/test/snapshots/test_extraction.py
@@ -1,4 +1,5 @@
 import pytest
+import time
 from factiva.analytics import SnapshotExtraction, UserKey, SnapshotExtractionQuery
 from factiva.analytics.common import config, const
 
@@ -12,28 +13,33 @@
 
 # Test operations before any API request
 def test_create_from_envuser():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     se = SnapshotExtraction()
     assert isinstance(se, SnapshotExtraction)
     assert se.user_key.key == ENVIRONMENT_USER_KEY
     assert se.query.get_payload() == {
         'query': {
             'where': ENVIRONMENT_WHERE_STATEMENT,
-            'format': 'avro'
+            'format': 'avro',
+            'shards': 25
         }
     }
 
 def test_create_from_user_param():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     se = SnapshotExtraction(user_key=VALID_USER_KEY)
     assert isinstance(se, SnapshotExtraction)
     assert se.user_key.key == VALID_USER_KEY
     assert se.query.get_payload() == {
         'query': {
             'where': ENVIRONMENT_WHERE_STATEMENT,
-            'format': 'avro'
+            'format': 'avro',
+            'shards': 25
         }
     }
 
 def test_create_from_userkey():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     u = UserKey()
     assert isinstance(u, UserKey)
     se = SnapshotExtraction(user_key=u)
@@ -42,22 +48,26 @@ def test_create_from_userkey():
     assert se.query.get_payload() == {
         'query': {
             'where': ENVIRONMENT_WHERE_STATEMENT,
-            'format': 'avro'
+            'format': 'avro',
+            'shards': 25
        }
     }
 
 def test_create_envuser_where():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     se = SnapshotExtraction(query=VALID_WHERE_STATEMENT)
     assert isinstance(se, SnapshotExtraction)
     assert se.user_key.key == ENVIRONMENT_USER_KEY
     assert se.query.get_payload() == {
         'query': {
             'where': VALID_WHERE_STATEMENT,
-            'format': 'avro'
+            'format': 'avro',
+            'shards': 25
         }
     }
 
 def test_create_envuser_envwhere():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     seq = SnapshotExtractionQuery()
     assert isinstance(seq, SnapshotExtractionQuery)
     se = SnapshotExtraction(query=seq)
@@ -66,11 +76,13 @@ def test_create_envuser_envwhere():
     assert se.query.get_payload() == {
         'query': {
             'where': ENVIRONMENT_WHERE_STATEMENT,
-            'format': 'avro'
+            'format': 'avro',
+            'shards': 25
        }
     }
 
 def test_failed_where_and_jobid():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     with pytest.raises(ValueError, match=r'The query and job_id parameters*'):
         se = SnapshotExtraction(query=VALID_WHERE_STATEMENT, job_id='abcd1234-ab12-ab12-ab12-abcdef123456')
         assert isinstance(se, SnapshotExtraction)
@@ -83,7 +95,9 @@ def test_job_envuser_envwhere():
     if GITHUB_CI:
         pytest.skip("Not to be tested in GitHub Actions")
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     sts = SnapshotExtraction()
+    sts.query.limit = 1000
     assert isinstance(sts, SnapshotExtraction)
     assert sts.process_job()
     assert sts.job_response.job_state == const.API_JOB_DONE_STATE
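The extraction tests now expect 'shards': 25 in the default payload and cap the live job via query.limit. A minimal sketch of the job flow the last test exercises (assumes the user key and default WHERE statement are configured through the environment, as in the tests):

from factiva.analytics import SnapshotExtraction

se = SnapshotExtraction()        # query built from the environment WHERE statement
se.query.limit = 1000            # keep the live job small, as the test does
se.process_job()                 # submits the job and waits until it is done
print(se.job_response.job_state) # expected to equal const.API_JOB_DONE_STATE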
diff --git a/test/snapshots/test_timeseries.py b/test/snapshots/test_timeseries.py
index d24cc27..99cee7b 100644
--- a/test/snapshots/test_timeseries.py
+++ b/test/snapshots/test_timeseries.py
@@ -1,4 +1,5 @@
 import pytest
+import time
 import pandas as pd
 from factiva.analytics import SnapshotTimeSeries, UserKey, SnapshotTimeSeriesQuery
 from factiva.analytics.common import config, const
@@ -12,6 +13,7 @@
 
 
 def test_create_from_envuser():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     sts = SnapshotTimeSeries()
     assert isinstance(sts, SnapshotTimeSeries)
     assert sts.user_key.key == ENVIRONMENT_USER_KEY
@@ -20,12 +22,12 @@ def test_create_from_envuser():
             'where': ENVIRONMENT_WHERE_STATEMENT,
             'frequency': const.API_MONTH_PERIOD,
             'date_field': const.API_PUBLICATION_DATETIME_FIELD,
-            'group_dimensions': [],
             'top': 10
         }
     }
 
 def test_create_from_user_param():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     sts = SnapshotTimeSeries(user_key=VALID_USER_KEY)
     assert isinstance(sts, SnapshotTimeSeries)
     assert sts.user_key.key == VALID_USER_KEY
@@ -34,12 +36,12 @@ def test_create_from_user_param():
             'where': ENVIRONMENT_WHERE_STATEMENT,
             'frequency': const.API_MONTH_PERIOD,
             'date_field': const.API_PUBLICATION_DATETIME_FIELD,
-            'group_dimensions': [],
             'top': 10
         }
     }
 
 def test_create_from_userkey():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     u = UserKey()
     assert isinstance(u, UserKey)
     sts = SnapshotTimeSeries(user_key=u)
@@ -50,12 +52,12 @@ def test_create_from_userkey():
             'where': ENVIRONMENT_WHERE_STATEMENT,
             'frequency': const.API_MONTH_PERIOD,
             'date_field': const.API_PUBLICATION_DATETIME_FIELD,
-            'group_dimensions': [],
             'top': 10
         }
     }
 
 def test_create_envuser_where():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     sts = SnapshotTimeSeries(query=VALID_WHERE_STATEMENT)
     assert isinstance(sts, SnapshotTimeSeries)
     assert sts.user_key.key == ENVIRONMENT_USER_KEY
@@ -64,12 +66,12 @@ def test_create_envuser_where():
             'where': VALID_WHERE_STATEMENT,
             'frequency': const.API_MONTH_PERIOD,
             'date_field': const.API_PUBLICATION_DATETIME_FIELD,
-            'group_dimensions': [],
             'top': 10
         }
     }
 
 def test_create_envuser_envwhere():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     seq = SnapshotTimeSeriesQuery()
     assert isinstance(seq, SnapshotTimeSeriesQuery)
     sts = SnapshotTimeSeries(query=seq)
@@ -80,12 +82,13 @@ def test_create_envuser_envwhere():
             'where': ENVIRONMENT_WHERE_STATEMENT,
             'frequency': const.API_MONTH_PERIOD,
             'date_field': const.API_PUBLICATION_DATETIME_FIELD,
-            'group_dimensions': [],
             'top': 10
         }
     }
 
 def test_failed_where_and_jobid():
+    if GITHUB_CI:
+        pytest.skip("Not to be tested in GitHub Actions")
     with pytest.raises(ValueError, match=r'The query and job_id parameters*'):
         sts = SnapshotTimeSeries(query=VALID_WHERE_STATEMENT, job_id='abcd1234-ab12-ab12-ab12-abcdef123456')
         assert isinstance(sts, SnapshotTimeSeries)
@@ -93,11 +96,12 @@
 
 # Test operations sending requests to the API
 # These are only executed when running locally. For optimisation purposes
-# no API tests are executed in the CI/CD (GitHub Actions) environment.
+# no heavy API tests are executed in the CI/CD (GitHub Actions) environment.
 
 def test_job_envuser_envwhere():
     if GITHUB_CI:
         pytest.skip("Not to be tested in GitHub Actions")
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     sts = SnapshotTimeSeries()
     assert isinstance(sts, SnapshotTimeSeries)
     assert sts.process_job()
@@ -108,4 +112,4 @@ def test_job_envuser_envwhere():
     assert (sts.job_response.errors == None)
     assert isinstance(sts.job_response.data, pd.DataFrame)
     assert len(sts.job_response.data.columns) >= 2
-
+    assert len(sts.job_response.data) > 0
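With 'group_dimensions' dropped from the default payload, a time-series job reduces to the sketch below (same environment assumptions as the extraction sketch above; job_response.data is a pandas DataFrame per the assertions):

from factiva.analytics import SnapshotTimeSeries

sts = SnapshotTimeSeries()
sts.process_job()
df = sts.job_response.data       # DataFrame with at least two columns
print(df.head())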
diff --git a/test/taxonomy/test_factivataxonomy.py b/test/taxonomy/test_factivataxonomy.py
index 48fb2c6..ffaaa1b 100644
--- a/test/taxonomy/test_factivataxonomy.py
+++ b/test/taxonomy/test_factivataxonomy.py
@@ -1,8 +1,11 @@
 import os
+import pytest
+import time
 import pandas as pd
 from factiva.analytics import FactivaTaxonomy, FactivaTaxonomyCategories, UserKey
-from factiva.analytics.common import config
+from factiva.analytics.common import config, const
 
+GITHUB_CI = config.load_environment_value('CI', False)
 FACTIVA_USERKEY = config.load_environment_value("FACTIVA_USERKEY")
 SAVE_PATH = os.getcwd()
 
@@ -17,23 +20,26 @@ def test_create_taxonomy_instance_str_user():
 
 
 def test_create_taxonomy_instance_userkey_user():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     u = UserKey()
     t = FactivaTaxonomy(user_key=u)
     assert t.user_key.key == FACTIVA_USERKEY
 
 
 def test_download_category_file():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     t = FactivaTaxonomy()
     assert t.download_raw_category(FactivaTaxonomyCategories.INDUSTRIES, path=SAVE_PATH)
     assert t.download_raw_category(FactivaTaxonomyCategories.INDUSTRIES, path=SAVE_PATH, file_format='avro')
     try:
-        os.remove(f'{SAVE_PATH}/{FactivaTaxonomyCategories.INDUSTRIES.value}.csv')
-        os.remove(f'{SAVE_PATH}/{FactivaTaxonomyCategories.INDUSTRIES.value}.avro')
+        os.remove(f"{SAVE_PATH}/{FactivaTaxonomyCategories.INDUSTRIES.value}.csv")
+        os.remove(f"{SAVE_PATH}/{FactivaTaxonomyCategories.INDUSTRIES.value}.avro")
     except:
         pass
 
 
 def test_get_category_codes():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     t = FactivaTaxonomy()
     industries = t.get_category_codes(FactivaTaxonomyCategories.INDUSTRIES)
     assert isinstance(industries, pd.DataFrame)
@@ -43,6 +49,7 @@
 
 
 def test_lookup_code_good():
+    time.sleep(const.TEST_REQUEST_SPACING_SECONDS)
     t = FactivaTaxonomy()
     assert t.all_subjects == None
     mcat = t.lookup_code('MCAT', FactivaTaxonomyCategories.SUBJECTS)
diff --git a/test/test_listener_handlers.txt b/test/test_listener_handlers.txt
index ad930c6..0890160 100644
--- a/test/test_listener_handlers.txt
+++ b/test/test_listener_handlers.txt
@@ -37,7 +37,7 @@ class TestListenerTools(unittest.TestCase):
         stream_short_id = BASIC_SUBSCRIPTION_ID.split('-')[-3]
         action = BASIC_ADD_MESSAGE['action']
         current_hour = datetime.datetime.utcnow().strftime('%Y%m%d%H')
-        file_result = f'{stream_short_id}_{action}_{current_hour}.jsonl'
+        file_result = f"{stream_short_id}_{action}_{current_hour}.jsonl"
         assert result == True
         assert os.path.exists(
             os.path.join(const.LISTENER_FILES_DEFAULT_FOLDER, file_result)) == True
diff --git a/test/test_request.txt b/test/test_request.txt
index 6149f81..2176108 100644
--- a/test/test_request.txt
+++ b/test/test_request.txt
@@ -15,7 +15,7 @@
 headers_dict = {'user-key': FACTIVA_USERKEY}
 
 
 def dowload_pit_url(identifier, file_format):
-    return f'{API_HOST}{API_SNAPSHOTS_TAXONOMY_BASEPATH}{API_SNAPSHOTS_COMPANIES_PIT}/{identifier}/{file_format}'
+    return f"{API_HOST}{API_SNAPSHOTS_TAXONOMY_BASEPATH}{API_SNAPSHOTS_COMPANIES_PIT}/{identifier}/{file_format}"
 
 
 class TestRequests(unittest.TestCase):