Skip to content

Commit 9cd961c

Browse files
authored
Merge pull request #944 from econdb/main
Updates in docstrings and test cases
2 parents 398c5a6 + dcb05e7 commit 9cd961c

File tree

5 files changed

+186
-84
lines changed

5 files changed

+186
-84
lines changed

docs/source/remote_data.rst

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -305,12 +305,26 @@ for United States, is as simple as taking the ticker segment from the URL path
305305

306306
.. code-block:: ipython
307307
308-
import os
309-
import pandas_datareader.data as web
308+
In [1]: import os
310309
311-
f = web.DataReader('ticker=RGDPUS', 'econdb')
312-
f.head()
310+
In [2]: import pandas_datareader as pdr
313311
312+
In [3]: f = pdr.get_data_econdb('ticker=RGDPUS')
313+
In [4]: f.head()
314+
Out[4]:
315+
TableName T10106
316+
SeriesCode A191RX
317+
Table Table 1.1.6. Real Gross Domestic Product, Ch...
318+
Series description Gross domestic product
319+
CL_UNIT Level
320+
CL_FREQ Q
321+
Note Table 1.1.6. Real Gross Domestic Product, Ch...
322+
TIME_PERIOD
323+
2018-01-01 18437128
324+
2018-04-01 18565696
325+
2018-07-01 18699748
326+
2018-10-01 18733740
327+
2019-01-01 18835412
314328
315329
The code snippet for exporting the whole dataset, or its filtered down subset,
316330
can be generated by using the Export -> Pandas Python3 functionality
@@ -319,16 +333,47 @@ such as the Eurostat's `GDP and main components <https://www.econdb.com/dataset/
319333

320334
.. code-block:: ipython
321335
322-
import os
323-
import pandas_datareader.data as web
336+
In [1]: import os
324337
325-
df = web.DataReader('dataset=NAMQ_10_GDP&v=Geopolitical entity (reporting)&h=TIME&from=2018-05-01&to=2021-01-01&GEO=[AL,AT,BE,BA,BG,HR,CY,CZ,DK,EE,EA19,FI,FR,DE,EL,HU,IS,IE,IT,XK,LV,LT,LU,MT,ME,NL,MK,NO,PL,PT,RO,RS,SK,SI,ES,SE,CH,TR,UK]&NA_ITEM=[B1GQ]&S_ADJ=[SCA]&UNIT=[CLV10_MNAC]', 'econdb')
326-
df.columns
338+
In [2]: import pandas_datareader as pdr
339+
340+
In [3]: df = pdr.get_data_econdb('dataset=NAMQ_10_GDP&v=Geopolitical entity (reporting)'
341+
'&h=TIME&from=2018-05-01&to=2021-01-01'
342+
'&GEO=[UK,ES,IT,DE,FR,CH,AT]&NA_ITEM=[B1GQ]'
343+
'&S_ADJ=[SCA]&UNIT=[CLV10_MNAC]')
344+
In [4]: df.head()
345+
Out[4]:
346+
Frequency Quarterly ...
347+
Unit of measure Chain linked volumes (2010), million units of national currency ...
348+
Seasonal adjustment Seasonally and calendar adjusted data ...
349+
National accounts indicator (ESA10) Gross domestic product at market prices ...
350+
Geopolitical entity (reporting) Austria ... Switzerland
351+
TIME_PERIOD ...
352+
2018-07-01 83427 ... 181338
353+
2018-10-01 84268 ... 181767
354+
2019-01-01 84919 ... 182039
355+
2019-04-01 84476 ... 182848
356+
2019-07-01 84822 ... 183866
357+
358+
In both cases, metadata for the requested Econdb series or dataset
359+
is in the ``MultiIndex`` columns of the returned ``DataFrame``,
360+
and can be conveniently converted to a ``dict`` as demonstrated below:
361+
362+
.. code-block:: ipython
363+
364+
In [5]: meta = df.columns.to_frame().iloc[0].to_dict() # first column, positionally
365+
In [6]: meta
Out[6]:
366+
{'Frequency': 'Quarterly',
367+
'Unit of measure': 'Chain linked volumes (2010), million units of national currency',
368+
'Seasonal adjustment': 'Seasonally and calendar adjusted data',
369+
'National accounts indicator (ESA10)': 'Gross domestic product at market prices',
370+
'Geopolitical entity (reporting)': 'Austria'}
327371
328372
Datasets can be located through Econdb's `search <https://www.econdb.com/search>`__
329373
engine, or discovered by exploring the `tree <https://www.econdb.com/tree/>`__
330374
of available statistical sources.
331375

376+
332377
.. _remote_data.enigma:
333378

334379
Enigma

pandas_datareader/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
get_components_yahoo,
99
get_dailysummary_iex,
1010
get_data_alphavantage,
11+
get_data_econdb,
1112
get_data_enigma,
1213
get_data_famafrench,
1314
get_data_fred,
@@ -38,6 +39,7 @@
3839
__all__ = [
3940
"__version__",
4041
"get_components_yahoo",
42+
"get_data_econdb",
4143
"get_data_enigma",
4244
"get_data_famafrench",
4345
"get_data_yahoo",

pandas_datareader/data.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141

4242
__all__ = [
4343
"get_components_yahoo",
44+
"get_data_econdb",
4445
"get_data_enigma",
4546
"get_data_famafrench",
4647
"get_data_fred",
@@ -80,6 +81,10 @@ def get_data_yahoo(*args, **kwargs):
8081
return YahooDailyReader(*args, **kwargs).read()
8182

8283

84+
def get_data_econdb(*args, **kwargs):
85+
return EcondbReader(*args, **kwargs).read()
86+
87+
8388
def get_data_enigma(*args, **kwargs):
8489
return EnigmaReader(*args, **kwargs).read()
8590

pandas_datareader/econdb.py

Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,65 @@
44

55

66
class EcondbReader(_BaseReader):
7-
"""Get data for the given name from Econdb."""
7+
"""
8+
Returns DataFrame of the requested Econdb series or dataset, over date
9+
range, start to end.
10+
11+
.. versionadded:: 0.5.0
12+
13+
Parameters
14+
----------
15+
symbols : string
16+
Can be in two different formats:
17+
1. 'ticker=<code>' for fetching a single series,
18+
where <code> is, e.g., CPIUS for the series at
19+
https://www.econdb.com/series/CPIUS/
20+
2. 'dataset=<dataset>&<params>' for fetching full
21+
or filtered subset of a dataset, like the one at
22+
https://www.econdb.com/dataset/ABS_GDP. After choosing the desired filters,
23+
the correctly formatted query string can be easily generated
24+
from that dataset's page by using the Export function, and choosing Pandas Python3.
25+
start : string, int, date, datetime, Timestamp
26+
Starting date. Parses many different kinds of date
27+
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
28+
end : string, int, date, datetime, Timestamp
29+
Ending date
30+
retry_count : int, default 3
31+
Number of times to retry query request.
32+
pause : float, default 0.1
33+
Time, in seconds, to pause between consecutive queries of chunks. If
34+
single value given for symbol, represents the pause between retries.
35+
session : Session, default None
36+
requests.sessions.Session instance to be used
37+
"""
838

939
_URL = "https://www.econdb.com/api/series/"
1040
_format = None
1141
_show = "labels"
1242

13-
def __init__(self, *args, **kwargs):
14-
super().__init__(**kwargs)
43+
def __init__(
44+
self,
45+
symbols,
46+
start=None,
47+
end=None,
48+
retry_count=3,
49+
pause=0.1,
50+
session=None,
51+
freq=None,
52+
):
53+
super(EcondbReader, self).__init__(
54+
symbols=symbols,
55+
start=start,
56+
end=end,
57+
retry_count=retry_count,
58+
pause=pause,
59+
session=session,
60+
freq=freq,
61+
)
1562
params = dict(s.split("=") for s in self.symbols.split("&"))
16-
if "from" in params and not kwargs.get("start"):
63+
if "from" in params and not start:
1764
self.start = pd.to_datetime(params["from"], format="%Y-%m-%d")
18-
if "to" in params and not kwargs.get("end"):
65+
if "to" in params and not end:
1966
self.end = pd.to_datetime(params["to"], format="%Y-%m-%d")
2067

2168
@property
@@ -43,10 +90,13 @@ def show_func(x):
4390
def show_func(x):
4491
return x[: x.find(":")]
4592

93+
unique_keys = {k for s in results for k in s["additional_metadata"]}
4694
for entry in results:
4795
series = pd.DataFrame(entry["data"])[["dates", "values"]].set_index("dates")
4896
head = entry["additional_metadata"]
49-
97+
for k in unique_keys:
98+
if k not in head:
99+
head[k] = "-1:None"
50100
if head != "": # this additional metadata is not blank
51101
series.columns = pd.MultiIndex.from_tuples(
52102
[[show_func(x) for x in head.values()]],
Lines changed: 70 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,33 @@
11
import numpy as np
22
import pandas as pd
3-
from pandas import testing as tm
43
import pytest
54

65
from pandas_datareader import data as web
76

87
pytestmark = pytest.mark.stable
98

109

10+
def assert_equal(x, y):
11+
assert np.isclose(x, y, rtol=1e-2)
12+
13+
1114
class TestEcondb(object):
15+
16+
def test_override_start_end(self):
17+
df = web.DataReader(
18+
'&'.join([
19+
'dataset=RBI_BULLETIN',
20+
'v=TIME',
21+
'h=Indicator',
22+
'from=2022-01-01',
23+
'to=2022-07-01'
24+
]),
25+
'econdb',
26+
start='2020-01-01',
27+
end='2022-01-01'
28+
)
29+
assert isinstance(df.index, pd.DatetimeIndex)
30+
1231
def test_infer_start_end_from_symbols(self):
1332
df = web.DataReader(
1433
(
@@ -23,88 +42,69 @@ def test_infer_start_end_from_symbols(self):
2342
assert df.index[0].year == 2010
2443
assert df.index[-1].year == 2018
2544

26-
@pytest.mark.xfail(reason="Dataset does not exist on Econdb")
27-
def test_get_cdh_e_fos(self):
28-
# EUROSTAT
29-
# Employed doctorate holders in non managerial and non professional
30-
# occupations by fields of science (%)
31-
df = web.DataReader(
32-
"dataset=CDH_E_FOS&GEO=NO,PL,PT,RU&FOS07=FOS1&Y_GRAD=TOTAL",
33-
"econdb",
34-
start=pd.Timestamp("2005-01-01"),
35-
end=pd.Timestamp("2010-01-01"),
36-
)
37-
assert isinstance(df, pd.DataFrame)
38-
assert df.shape == (2, 4)
39-
40-
# the levels and not returned consistently for econdb
41-
names = list(df.columns.names)
42-
levels = [lvl.values.tolist() for lvl in list(df.columns.levels)]
43-
44-
exp_col = pd.MultiIndex.from_product(levels, names=names)
45-
exp_idx = pd.DatetimeIndex(["2006-01-01", "2009-01-01"], name="TIME_PERIOD")
46-
47-
values = np.array([[25.49, np.nan, 39.05, np.nan], [20.38, 25.1, 27.77, 38.1]])
48-
expected = pd.DataFrame(values, index=exp_idx, columns=exp_col)
49-
tm.assert_frame_equal(df, expected)
50-
51-
def test_get_tourism(self):
52-
# OECD
53-
# TOURISM_INBOUND
45+
tickers = [
46+
f"{sec}{geo}"
47+
for sec in ["RGDP", "CPI", "URATE"]
48+
for geo in ["US", "UK", "ES", "AR"]
49+
]
5450

51+
@pytest.mark.parametrize("ticker", tickers)
52+
def test_fetch_single_ticker_series(self, ticker):
5553
df = web.DataReader(
56-
"dataset=OE_TOURISM_INBOUND&COUNTRY=JPN,USA&VARIABLE=INB_ARRIVALS_TOTAL",
54+
f"ticker={ticker}",
5755
"econdb",
58-
start=pd.Timestamp("2008-01-01"),
59-
end=pd.Timestamp("2012-01-01"),
60-
)
61-
df = df.astype(float)
62-
jp = np.array([8351000, 6790000, 8611000, 6219000, 8368000], dtype=float)
63-
us = np.array(
64-
[175702304, 160507424, 164079728, 167600272, 171320416], dtype=float
56+
start=pd.Timestamp("2010-01-01"),
57+
end=pd.Timestamp("2013-01-27"),
6558
)
66-
index = pd.date_range("2008-01-01", "2012-01-01", freq="AS", name="TIME_PERIOD")
59+
assert df.shape[1] == 1
60+
assert isinstance(df.index, pd.DatetimeIndex)
6761

68-
# check the values coming back are equal
69-
np.testing.assert_array_equal(df.values[:, 0], jp)
70-
np.testing.assert_array_equal(df.values[:, 1], us)
71-
72-
# sometimes the country and variable columns are swapped
73-
df = df.swaplevel(2, 1, axis=1)
74-
for label, values in [("Japan", jp), ("United States", us)]:
75-
expected = pd.Series(
76-
values, index=index, name="Total international arrivals"
77-
)
78-
expected.index.freq = None
79-
tm.assert_series_equal(
80-
df[label]["Tourism demand surveys"]["Total international arrivals"],
81-
expected,
82-
)
83-
84-
def test_bls(self):
85-
# BLS
86-
# CPI
62+
def test_single_nonticker_series(self):
8763
df = web.DataReader(
8864
"ticker=BLS_CU.CUSR0000SA0.M.US",
8965
"econdb",
9066
start=pd.Timestamp("2010-01-01"),
9167
end=pd.Timestamp("2013-01-27"),
9268
)
69+
assert df.shape[1] == 1
70+
assert isinstance(df.index, pd.DatetimeIndex)
71+
assert_equal(df.loc["2010-05-01"][0], 217.3)
9372

94-
assert df.loc["2010-05-01"][0] == 217.3
73+
def test_filtered_dataset(self):
74+
df = web.DataReader(
75+
"&".join(
76+
[
77+
"dataset=PRC_HICP_MIDX",
78+
"v=Geopolitical entity (reporting)",
79+
"h=TIME",
80+
"from=2022-03-01",
81+
"to=2022-09-01",
82+
"COICOP=[CP00]",
83+
"FREQ=[M]",
84+
"GEO=[ES,AT,CZ,IT,CH]",
85+
"UNIT=[I15]",
86+
]
87+
),
88+
"econdb",
89+
)
90+
assert df.shape[1] == 5
91+
assert isinstance(df.index, pd.DatetimeIndex)
9592

9693
def test_australia_gdp(self):
9794
df = web.DataReader(
98-
"dataset=ABS_GDP&to=2019-09-01&from=1959-09-01&h=TIME&v=Indicator", "econdb"
99-
)
100-
assert (
101-
df.loc[
102-
"2017-10-01",
103-
(
104-
"GDP per capita: Current prices - National Accounts",
105-
"Seasonally Adjusted",
106-
"AUD",
107-
),
108-
]
109-
== 18329
95+
"&".join(
96+
[
97+
"dataset=ABS_GDP",
98+
"4=[7]",
99+
"6=[11]",
100+
"16=[1267]",
101+
"v=TIME",
102+
"h=Indicator",
103+
"from=2019-10-01",
104+
"to=2022-06-01",
105+
"GEO=[13]",
106+
]
107+
),
108+
"econdb",
110109
)
110+
assert_equal(df.squeeze().loc["2020-10-01"], 508603)

0 commit comments

Comments
 (0)